diff --git a/Cluster.cpp b/Cluster.cpp new file mode 100644 index 0000000..89fac7a --- /dev/null +++ b/Cluster.cpp @@ -0,0 +1,180 @@ +/* + A fast clustering algorithm for PPs guided by a tree +*/ + + +#include "Cluster.h" +#include "Tree.h" + +using namespace::std; +#include + +// Some initialisation +CCluster *CCluster::_cluster = NULL; +CCluster * CCluster::Instance() { + if(!_cluster) { + _cluster = new CCluster(); + } + return _cluster; +} + +// MakePairs() +// --- +// Computes the distance matrix from the tree and uses it to select a set of pairs for testing during divvying +// Gets the splits from the tree +void CCluster::MakePairs() { + assert(_tree.NoSeq() == NoSeq()); + // Splits + assert(_splits.empty()); + // All the splits + for(int i = 0; i < _tree.NoBra() ; i++) { + _splits.push_back(_tree.GetSplit( i )); + } + // Distance matrix + vector distances = _tree.GetTreePW(); + vector big, small; + vector > distSet; // The set of pairwise comparisons (int i, int j) and their distance (double) + vector > pairs2add; + for(SSplit &split : _splits) { + // Get the full list of pairwise comparisons in the splits + if(split.Left.size() > split.Right.size()) { + big = split.Left; small = split.Right; + } else { + big = split.Right; small = split.Left; + } + // Create the list of pairwise comparisons sorted by distance. x always small; y alway big + for(int &x : small) { + for(int &y : big) { + distSet.push_back(make_tuple(x,y,distances[(y*NoSeq()) + x])); + } + } + sort(begin(distSet), end(distSet), [](auto const &t1, auto const &t2) { + return get<2>(t1) < get<2>(t2); + }); + // Obtain samples. The rules are: + // 1. Unique sequences from the small data set are the priority, each with a different partner + vector small_count(NoSeq(),0),big_count(NoSeq(),0); + int small_max = ceil( ( (double) _approxNumber / (double) small.size() ) +1 ); + int big_max = ceil( ( (double) _approxNumber / (double) big.size() ) + 1); + for(int i = 0; i < distSet.size(); i++) { + if(small_count[get<0>(distSet[i])] >= small_max) { continue; } + if(big_count[get<1>(distSet[i])] >= big_max) { continue; } + pairs2add.push_back(vector{get<0>(distSet[i]),get<1>(distSet[i])}); + small_count[get<0>(distSet[i])]++; + big_count[get<1>(distSet[i])]++; + if(pairs2add.size() >= _approxNumber) { break; } // Finish when we have the full list + } + _pairs.push_back(pairs2add); + // Clean up + pairs2add.clear(); + distSet.clear(); + big.clear(); + small.clear(); + } + _ready = true; +} + +// Get the set of pairs for testing split numbered splitNum +vector > CCluster::GetPairs(int splitNum) { + return _pairs[splitNum]; +} + +vector > CCluster::PairsToCalculate() { + vector > retVec; + // Get the list. Note only upper 1/2 of diagonal + for(auto x : _pairs) { + for(auto y : x) { + if(y[0] > y[1]) { + retVec.push_back( vector{y[1],y[0]} ); + } else { + retVec.push_back( vector{y[0],y[1]} ); + } + } + } + if(retVec.size() == 0) { cout << "\nCCluster::PairsToCalculate() :: Need to have pairs to calculate... Have you initialised properly? Is the Tree wrong?\n"; exit(-1); } + // Sort it + sort(retVec.begin(), retVec.end(), [](auto x, auto y) { + return (x[1] < y[1] && x[0] < y[0]); + }); + sort(retVec.begin(), retVec.end(), [](vector x, vector y) { + if(x[0] < y[0]) { return true; } + if(x[0] > y[0]) { return false; } + if(x[1] < y[1]) { return true; } + return false; + }); + // Remove redundancy + for(int i = 0 ; i < retVec.size() - 1; i++) { + if(retVec[i+1][0] == retVec[i][0] && retVec[i+1][1] == retVec[i][1]) { + retVec.erase(retVec.begin() + i + 1); + i--; + } + } + return retVec; +} + +vector < vector > CCluster::OutputClusters(vector PPs, double threshold, int clusterMethod) { + assert(_ready); + vector > retSplits; + vector starter(NoSeq(),0); + for(int i = 0; i < NoSeq(); i++) { starter[i] = i; } + retSplits.push_back(starter); + // Find what clusters to make. Can change this function + for(int i = 0 ; i < _splits.size(); i++) { + if(!TestSplit(i,threshold,PPs,clusterMethod)) { + retSplits = AddSplit(i,retSplits); + } + } + // Sort them so they're in a nice order; the structure of hte tree splits mean they might not be + sort(retSplits.begin(), retSplits.end(),[](const vector& a, const vector& b) { + return a[0] < b[0]; + }); + return retSplits; +} + +// Calculates a test statistic based on PPs and compares it to threshold. If greater it passes and returns true +bool CCluster::TestSplit(int split2Test, double threshold, vector &PPs, int testMethod) { + double testStat = 0; + switch(testMethod) { + case 0: // Compute the mean and compare to threshold + for(vector &v : _pairs[split2Test]) { + assert(v.size() == 2); + testStat += PPs[(v[0] * NoSeq()) + v[1]]; + } + if(testStat / (double) _pairs[split2Test].size() + DBL_EPSILON >= threshold) { return true; } + break; + default: + cout << "\nUnknown testMethod passed to CCluster::TestSplit(...)\n"; exit(-1); + }; + return false; +} + +vector > CCluster::AddSplit(int split2Add, vector > curSplit) { + vector > retSplit; + // Find the split in curSplit that has elements from both Left and Right _split(split2Add) present + for(vector &split : curSplit) { + unordered_set split_set ( split.begin(),split.end() ); + bool inLeft = false, inRight = false; + for(int &left : _splits[split2Add].Left) { + if(split_set.find(left) != split_set.end()) { inLeft = true; break; } + } + for(int &right : _splits[split2Add].Right) { + if(split_set.find(right) != split_set.end()) { inRight = true; break; } + } + if(!inLeft || !inRight) { retSplit.push_back(split); continue; } + vector newSplit; + for(int &left : _splits[split2Add].Left) { + if(split_set.find(left) != split_set.end()) { newSplit.push_back(left); } + } + assert(newSplit.size() > 0); + retSplit.push_back(newSplit); // Note: no sorting required because they're pre-sorted + newSplit.clear(); + for(int &right : _splits[split2Add].Right) { + if(split_set.find(right) != split_set.end()) { newSplit.push_back(right); } + } + assert(newSplit.size() > 0); + retSplit.push_back(newSplit); // Note: no sorting required because they're pre-sorted + } + return retSplit; +} + + diff --git a/Cluster.h b/Cluster.h new file mode 100644 index 0000000..6f559e1 --- /dev/null +++ b/Cluster.h @@ -0,0 +1,46 @@ +/* + A fast clustering algorithm for PPs guided by a tree +*/ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Tree.h" + +class CCluster{ +public: + static CCluster *Instance(); + bool AddNames(std::vector Names) { if(!_names.empty()) { return false; }_names = Names; return true; }; + bool AddTree(std::string Tree) { if(_tree.NoSeq() > 0) { return false; } _tree = CTree(Tree,_names); MakePairs(); return true; } + + // Access for the pairs needed for calculation + vector > PairsToCalculate(); // Gets the list of pairs to calculate + + // The output information + // clusterMethods: [0] : mean + std::vector > OutputClusters(vector PPs, double threshold, int clusterMethod = 0); // The result of the clustering + +private: + bool _ready = false; + int NoSeq() { return _names.size(); } + static CCluster * _cluster; + std::vector _names; + int _approxNumber = 10; // The number of pairwise comparisons to make for each split + CTree _tree; + // Values linked to the clustering + void MakePairs(); // Calculates the pairs needed for clustering + std::vector _splits; // The tree splits (not including trivial splits) + std::vector > > _pairs; // The set of pairs examined to assess each split + vector > GetPairs(int splitNum); // Get the pairs for a specific split number + bool TestSplit(int split2Test, double threshold, vector &PPs, int testMethod = 0); // Function that uses PPs to test a specific split + vector > AddSplit(int split2Add, vector > curSplit); // Adds the _split(split2Add) to the current set of splits + +}; diff --git a/Divvier.cpp b/Divvier.cpp new file mode 100644 index 0000000..019262c --- /dev/null +++ b/Divvier.cpp @@ -0,0 +1,201 @@ +/* + * Divvier.cpp + * + * Created on: 24 Oct 2017 + * Author: simon + */ + +// Imported stuff from Zorro and bionj +extern "C" { +#define EXTERN extern + +#include "trim.h" +#include "hmm.h" +#include "utils.h" + +#undef EXTERN + +} + +#include "Divvier.h" +#include "Cluster.h" +#include "Tree.h" +#include + +string BigTree = "((EOG52FR0S|AGAMB_3.6|Bombylius_major|s3305_L_37822_0-179:0.10258365074488022539,((EOG52FR0S|AGAMB_3.6|Trichocera_fuscata|C1011097-176:0.37919204223948976828,EOG52FR0S|AGAMB_3.6|Bibio_marci|s6005_L_37988_0-160:0.21668222714464321910):0.07156413693204413673,((EOG52FR0S|NVITR_1.2|Apachyus_chartaceus|s5108_L_54582_0-173:0.27997964631343663644,EOG52FR0S|TCAST_3.0reorderedheaderfields|Forficula_auricularia|C894882-184:0.35947113088008203485):0.40581948723979877069,((((EOG52FR0S|ZNEVA_2.1|Platycentropus_radiatus|s10078_L_106168_0-182:0.28712730117582346834,EOG52FR0S|BMORI_2.0|Rhyacophila_fasciata|C295776-176:0.20879844376069728318):0.08377470818577073541,(((((EOG52FR0S|BMORI_2.0|BMORI|BGIBMGA013243_TA-192:0.20796019754844122240,(EOG52FR0S|BMORI_2.0|Parides_arcas|s11100_L_89186_0-191:0.15948176694983298707,(EOG52FR0S|BMORI_2.0|Polyommatus_icarus|C448597-191:0.20728969418089865373,EOG52FR0S|BMORI_2.0|Zygaena_fausta|C408340-190:0.31914552697676795701):0.07335145172361670629):0.05053893709918481220):0.05850196533057449438,EOG52FR0S|BMORI_2.0|Yponomeuta_evonymella|C659571-189:0.24661963489412574990):0.04379433474415747596,(EOG52FR0S|BMORI_2.0|Nemophora_deegerella|s9473_L_47391_0-192:0.34388825491603491891,(EOG52FR0S|BMORI_2.0|Eriocrania_subpurpurella|C657810-162:0.28016145828226290959,EOG52FR0S|BMORI_2.0|Triodia_sylvina|s19689_L_226945_0-114:0.25180531327347416282):0.05477461074677487246):0.05850214087699501936):0.27893477336102578956,EOG52FR0S|TCAST_3.0reorderedheaderfields|Hydroptilidae_sp|C371109-171:0.20954375979877706837):0.08227257382311191358,(EOG52FR0S|AGAMB_3.6|Philopotamus_ludificatus|contig11196-112:0.18670733170122680300,EOG52FR0S|AGAMB_3.6|Cheumatopsyche_sp|C346787-143:0.24975404489548827525):0.10840344418213380961):0.03058718575931992922):0.20521928397267710786,(((EOG52FR0S|ZNEVA_2.1|Ceratophyllus_gallinae|C304111-181:0.15468533185857691326,(EOG52FR0S|ZNEVA_2.1|Ctenocephalides_felis|C330557-186:0.06671445089985969523,EOG52FR0S|ZNEVA_2.1|Archaeopsylla_erinacei|contig19363-160:0.07554576633173239186):0.09865183937584097451):0.15539737916237764126,(EOG52FR0S|AGAMB_3.6|Bittacus_pilicornis|C194995-166:0.13838638337865413752,EOG52FR0S|TCAST_3.0reorderedheaderfields|Panorpa_vulgaris|C326395-171:0.19468361452809271328):0.08887788353119491225):0.06652652542145692793,(EOG52FR0S|DMELA_5.40|Nannochorista_sp|contig01540-115:0.31122981729268078821,EOG52FR0S|ZNEVA_2.1|Boreus_hyemalis|s9185_L_107660_0-179:0.20765950329740226477):0.05863533568266944551):0.03902888353479667255):0.03538387372803234593,(((((EOG52FR0S|ZNEVA_2.1|Zorotypus_caudelli|C2490937-177:0.34715128115262866570,(EOG52FR0S|ZNEVA_2.1|Grylloblatta_bifratrilecta|s13116_L_117268_0-182:0.04666403811736503232,EOG52FR0S|ZNEVA_2.1|Galloisiana_yuasai|C897547-182:0.08611999898518697683):0.26871467228598600041):0.03144048514876583017,((((EOG52FR0S|ZNEVA_2.1|Ceuthophilus_sp|C1389194-183:0.24510371778815792654,(EOG52FR0S|ZNEVA_2.1|Acanthocasuarina_muellerianae|C679096-177:0.78333252838140332575,EOG52FR0S|ZNEVA_2.1|Gryllotalpa_sp|s2139_L_10561_0-181:0.30716835037757195259):0.06356106894554112985):0.12891568640354211794,(EOG52FR0S|ZNEVA_2.1|Nilaparvata_lugens|C572437-185:0.30770644845472772122,((EOG52FR0S|ZNEVA_2.1|Timema_cristinae|s15504_L_191927_0-183:0.31245666328246152199,((EOG52FR0S|ZNEVA_2.1|Campodea_augens|s12453_L_61651_1-175:0.64320360852185343159,EOG52FR0S|ZNEVA_2.1|Occasjapyx_japonicus|C446479-170:0.63303573982941852005):0.40917601625516664132,(EOG52FR0S|AGAMB_3.6|Velia_caprai|s10579_L_125610_0-173:0.66117482127068250009,((EOG52FR0S|ZNEVA_2.1|Peruphasma_schultei|s5798_L_35856_0-182:0.11847229589514635117,EOG52FR0S|ZNEVA_2.1|Aretaon_asperrimus|s17907_L_301633_0-182:0.07253143523751401367):0.19903205412866231683,EOG52FR0S|ZNEVA_2.1|Ranatra_linearis|s1330_L_4194_1-169:0.55354515933741088585):0.03262464634031971705):0.04083863261130472183):0.17194594463365361903):0.13632695045330456285,(EOG52FR0S|ZNEVA_2.1|Haploembia_palaui|C892465-181:0.03062609439495739272,EOG52FR0S|ZNEVA_2.1|Aposthonia_japonica|C1112469-182:0.08760650591975023549):0.20538604134669566359):0.06202511515297982891):0.03988691757109619901):0.06753599270869413418,((EOG52FR0S|ZNEVA_2.1|Tetrix_subulata|C601439-182:0.37131693579780711278,(EOG52FR0S|ZNEVA_2.1|Prosarthria_teretrirostris|C772607-153:0.27003519586186774948,EOG52FR0S|ZNEVA_2.1|Stenobothrus_lineatus|C2019176-167:0.31558548049701179439):0.08907778373876351630):0.07790452599305680570,(EOG52FR0S|ZNEVA_2.1|Perla_marginata|C644184-178:0.20904642311162455193,(EOG52FR0S|ZNEVA_2.1|Cosmioperla_kuna|C572547-181:0.28459243504544345926,EOG52FR0S|ZNEVA_2.1|Leuctra_sp|C439181-170:0.23534844734160628721):0.03611738912271651725):0.05315439554639348613):0.00628857273714119279):0.03777628665101530336,((EOG52FR0S|ZNEVA_2.1|Metallyticus_splendidus|s21420_L_361771_0-182:0.08139199083812696800,(EOG52FR0S|ZNEVA_2.1|Empusa_pennata|C1257614-182:0.05831369988802588555,EOG52FR0S|ZNEVA_2.1|Mantis_religiosa|s16763_L_273691_0-182:0.03767170927567057431):0.06640196255228469902):0.22577155249681363225,(EOG52FR0S|ZNEVA_2.1|Blaberus_atropus|s21873_L_207274_0-184:0.14385889001104165685,(EOG52FR0S|ZNEVA_2.1|Cryptocercus_sp|C1038622-184:0.06910321542211660117,((EOG52FR0S|ZNEVA_2.1|Periplaneta_americana|C1181093-183:0.19336899542727936652,EOG52FR0S|ZNEVA_2.1|Mastotermes_darwiniensis|C2113619-184:0.05240389976927931764):0.02280219262590709936,(EOG52FR0S|ZNEVA_2.1|ZNEVA|Znev_01260-184:0.07448774569891650210,EOG52FR0S|ZNEVA_2.1|Prorhinotermes_simplex|s11217_L_143197_0-184:0.06831791999531773574):0.00531771294730884643):0.02772591341670576498):0.13298325015205483823):0.05305481403293376558):0.08813889622810555657):0.01032094283925216564):0.04918804217134938039,((EOG52FR0S|ZNEVA_2.1|Gynaikothrips_ficorum|C875677-171:0.34817770995555730185,(EOG52FR0S|ZNEVA_2.1|Thrips_palmi|C276765-175:0.15906098279260383332,EOG52FR0S|TCAST_3.0reorderedheaderfields|Frankliniella_cephalica|C380329-158:0.21388659633192913523):0.16048147894155415094):0.07653538013830162023,((((EOG52FR0S|ZNEVA_2.1|Cypridininae_sp|DMPC15491392-165:0.52790443634768879910,(EOG52FR0S|PHUMA_1.2|Ectopsocus_briggsi|s15732_L_91826_0-174:0.46448893123991685794,(EOG52FR0S|PHUMA_1.2|Liposcelis_bostrychophila|C576092-177:0.30197120987225656297,EOG52FR0S|PHUMA_1.2|PHUMA|PHUM000230_RA-190:0.12517489564931963408):0.09559427078257826116):0.13402161218995289893):0.06452847926333950268,(EOG52FR0S|ZNEVA_2.1|Thermobia_domestica|C1632884-171:0.28087239716863043881,(EOG52FR0S|AECHI_3.8|Lepeophtheirus_salmonis|347703492-168:2.69374416389131710048,EOG52FR0S|DPULE_2010beta3|DPULE|hxNCBI_GNO_158323-183:0.39413559219271232514):0.26879970332748343020):0.03490067545383494785):0.03655927293058804789,((EOG52FR0S|PHUMA_1.2|Planococcus_citri|C485257-176:0.57727675594194338693,(EOG52FR0S|ZNEVA_2.1|Pogonognathellus_lon_fla|s10192_L_40563_0-168:0.35212892139224716281,((EOG52FR0S|AGAMB_3.6|Tetrodontophora_bielanensis|s4398_L_11904_0-170:0.31253292586648845353,EOG52FR0S|ZNEVA_2.1|Anurida_maritima|s7480_L_41346_1-179:0.53534165142180289987):0.09780428551627755318,(EOG52FR0S|PHUMA_1.2|Folsomia_candida|s13407_L_107771_0-183:0.32730306719440233065,EOG52FR0S|ZNEVA_2.1|Sminthurus_vir_nig|C438893-180:0.44815131648704759071):0.09981773077066337374):0.04633260513791208346):0.14420707809006821920):0.07789073415241687393,(((EOG52FR0S|ISCAP_1.1|ISCAP|ISCW010767_RA-187:0.89808705782006736928,EOG52FR0S|ZNEVA_2.1|Glomeris_pustulata|DMPC15855832-177:0.26687653412383394169):0.13569990008012094984,EOG52FR0S|ZNEVA_2.1|Symphylella_vulgaris|DMPC15369782-167:0.18976574371435883659):0.32321515569392572642,(EOG52FR0S|ZNEVA_2.1|Meinertellus_cundinamarcensis|C1499868-174:0.24727594137049682677,EOG52FR0S|ZNEVA_2.1|Machilis_hrabei|C992353-174:0.27952995969771998741):0.05531855326546314400):0.09807100926610833047):0.00000284652651879708):0.06328495355362576125,(((((EOG52FR0S|DPULE_2010beta3|Ephemera_danica|C517837-183:0.26107317656135214934,EOG52FR0S|DPULE_2010beta3|Eurylophella_sp|s1786_L_4814_0-127:0.22435150151057325907):0.26812301515613495839,EOG52FR0S|ZNEVA_2.1|Isonychia_bicolor|s3662_L_10128_0-182:0.21626403668416263604):0.09224864577838710888,(EOG52FR0S|ZNEVA_2.1|Acerentomon_sp|C729640-170:0.42360344963029672449,(EOG52FR0S|TCAST_3.0reorderedheaderfields|Litopenaeus_vannamei|DMPC7768585-59:0.92986603079020624385,EOG52FR0S|ZNEVA_2.1|Baetis_pumilus|C418006-172:0.24953913442093139663):0.06252057145089949530):0.09415038571919742694):0.12507299422017811863,((EOG52FR0S|ZNEVA_2.1|Speleonectes_tulumensis|DMPC15231531-108:0.45393692433976629008,EOG52FR0S|ZNEVA_2.1|Tricholepidion_gertschi|C1162185-171:0.16794054281747641810):0.05393389030309596321,EOG52FR0S|ZNEVA_2.1|Atelura_formicaria|C1786565-181:0.29696880472858544486):0.01849951439992951474):0.07567091885262067219,((EOG52FR0S|ZNEVA_2.1|Cordulegaster_boltoni|C513443-179:0.08046521448541195387,EOG52FR0S|ZNEVA_2.1|Epiophlebia_superstes|s9066_L_135880_0-171:0.05969989235595130755):0.08201709350185917846,EOG52FR0S|ZNEVA_2.1|Calopteryx_splendens|s6555_L_128950_0-175:0.18384783375122812354):0.19182395594675868966):0.01807997043484640964):0.01795310546278424194):0.06409461969434167294):0.05483711695084447085,(((EOG52FR0S|ZNEVA_2.1|Okanagana_villosa|C1047067-175:0.26389540938051336827,EOG52FR0S|ZNEVA_2.1|Tanzaniophasma_sp|C698355-178:0.34635230878060485615):0.06844350128053054705,(EOG52FR0S|TCAST_3.0reorderedheaderfields|Acanthosoma_haemorrhoidale|s6806_L_55372_0-167:0.41505233401759783485,EOG52FR0S|AGAMB_3.6|Xenophysella_greensladeae|s20205_L_145124_0-178:0.34376572523914900037):0.09259141827950416459):0.10450190963043513859,(((((EOG52FR0S|TCAST_3.0reorderedheaderfields|Xanthostigma_xanthostigma|C566271-164:0.19449936655034971711,EOG52FR0S|PHUMA_1.2|Inocellia_crassicornis|s10337_L_80389_0-154:0.33767557295784456084):0.07652478674593690688,((EOG52FR0S|TCAST_3.0reorderedheaderfields|Osmylus_fulvicephalus|C325502-162:0.32724880512702875235,(EOG52FR0S|TCAST_3.0reorderedheaderfields|Euroleon_nostras|C417689-154:0.22722803678459393972,EOG52FR0S|TCAST_3.0reorderedheaderfields|Dichochrysa_prasina|C865900-157:0.17965260506147759378):0.09226201333368116986):0.04968307450628078881,(EOG52FR0S|AECHI_3.8|Conwentzia_psociformis|s5518_L_8149_2-164:0.90973411417670912993,(EOG52FR0S|TCAST_3.0reorderedheaderfields|Corydalus_cornutus|C168679-171:0.36817967172391224961,EOG52FR0S|TCAST_3.0reorderedheaderfields|Sialis_lutaria|contig05237-171:0.31588037603904894901):0.08098061631711647723):0.02889324885925097230):0.21217546867391573473):0.42075910593195908760,((EOG52FR0S|TCAST_3.0reorderedheaderfields|Priacma_serrata|contig01572-167:0.48448965335818028333,(EOG52FR0S|TCAST_3.0reorderedheaderfields|Aleochara_curtula|C207305-170:0.31950480379657325569,(EOG52FR0S|TCAST_3.0reorderedheaderfields|Dendroctonus_ponderosae|DMPC14949187-172:0.25343465759882993771,(EOG52FR0S|TCAST_3.0reorderedheaderfields|TCAST|TC015202-338:0.39106414960961038974,EOG52FR0S|TCAST_3.0reorderedheaderfields|Meloe_violaceus|s3323_L_15628_0-171:0.32192844068618448050):0.11815592516250769672):0.06869563842199601089):0.02524545323405661193):0.06469464556989516779,(EOG52FR0S|TCAST_3.0reorderedheaderfields|Carabus_granulatus|contig20817-171:0.20936975200398974528,(EOG52FR0S|TCAST_3.0reorderedheaderfields|Lepicerus_sp|s31954_L_190160_0-169:0.30174290416189342157,(EOG52FR0S|TCAST_3.0reorderedheaderfields|Gyrinus_marinus|C426169-130:0.31698713369864911504,(EOG52FR0S|TCAST_3.0reorderedheaderfields|Stylops_melittae|s815_L_3558_4-173:1.09132541448030906395,EOG52FR0S|BMORI_2.0|Mengenilla|contig00812-50:0.24860954514891062117):0.36817374316011364233):0.09639351752230840287):0.12968634453582394972):0.04153894268180059052):0.09005386623017200276):0.07505777768422575158,EOG52FR0S|TCAST_3.0reorderedheaderfields|Cercopis_vulnerata|s2282_L_9869_0-164:0.44816506587907789516):0.04451910789939503288,(EOG52FR0S|TCAST_3.0reorderedheaderfields|Bemisia_tabaci|gi321197821-156:0.18512744296802555177,EOG52FR0S|TCAST_3.0reorderedheaderfields|Trialeurodes_vaporariorum|C822518-167:0.16601394260731558439):0.28381702924840379598):0.06874419825445349241):0.01224687049888137542):0.03621926839709280199,(EOG52FR0S|AECHI_3.8|Cotesia_vestalis|C300420-176:0.35530433915661974176,((EOG52FR0S|NVITR_1.2|NVITR|NV18101_RA-206:0.25865233882925381392,((EOG52FR0S|APISU_v2|Essigella_californica|s21039_L_220779_0-183:0.21768785384887601175,EOG52FR0S|APISU_v2|APISU|ACYPI006767_RA-183:0.00000284652651879708):0.97560016268775173742,EOG52FR0S|NVITR_1.2|Leptopilina_clavipes|C295537-168:0.21594239260245212675):0.05378157473385462156):0.07305057817828163047,(EOG52FR0S|AECHI_3.8|Chrysis_viridula|s10150_L_87082_0-168:0.18506806268268455318,(EOG52FR0S|AECHI_3.8|Tenthredo_koehleri|C329071-193:0.29226817370556612552,((EOG52FR0S|AECHI_3.8|Exoneura_robusta|gi323881566-124:0.16346549139818158936,EOG52FR0S|AMELL_prerelease2|AMELI|GB13139_RA-177:0.25344702471305829983):0.09054987763675816093,(EOG52FR0S|AECHI_3.8|Harp_salt|Hsal_15250-127:0.08760556090348606273,EOG52FR0S|AECHI_3.8|AECHI|Aech_11646-193:0.17994232010852431736):0.08338187287694917571):0.05369313507965074034):0.02027463668964252577):0.10887042098547439206):0.01448637001041356773):0.20399113308697866542):0.03794794241833907011):0.07883567945606387295):0.06444902158795283442):0.07803962599227282082):0.16751648001610505712,(((EOG52FR0S|AGAMB_3.6|Notostira_elongata|C433581-166:0.71271863719522066116,(EOG52FR0S|AGAMB_3.6|AGAMB|AGAP003199_RA-204:0.15983289722980076331,EOG52FR0S|AGAMB_3.6|Aede_aegy|AAEL000163-RA-202:0.16344503318490347099):0.02628487693242425455):0.26383259074490533758,EOG52FR0S|DMELA_5.40|DMELA|3674_CG31229PA-195:0.33554437029363948231):0.15838475752009065212,(EOG52FR0S|DMELA_5.40|Rhagoletis_pomonella|gi257100020-155:0.34539655929732071549,(EOG52FR0S|DMELA_5.40|Sarcophaga_crassipalpis|gi296337228-179:0.08465742730590501697,EOG52FR0S|DMELA_5.40|Triarthria_setipennis|C437403-177:0.14214045150649035065):0.07479398006429255341):0.04071023310863794431):0.07834467316795921954,EOG52FR0S|DMELA_5.40|Lipara_lucens|C642472-194:0.25694883168138404894);"; + +vector allPP; // Global holding the posteriors +vector names; +vector in_seq; + +int main(int argc, char *argv[]) { + + if(argc != 3) { cout << "\n./divvier file threshold"; exit(-1); } + + string fileIn = "ABCE.linsi.fasta"; + string suffixPP = ".PP"; + string suffixDivvy = ".divvy"; + bool skipSingles = false; + string divvy_char = "*"; + double threshold = 0.3; + + fileIn = argv[1]; + threshold = atof(argv[2]); + if(!InRange(threshold,0.0,1.0)) { cout << "\nError: threshold must be in range (0,1)\n"; exit(-1); } + + + // Read in the sequence using Zorro's method and store it in function scope + ReadAndPrepZorro(strdup(fileIn.c_str())); +// cout << "\nSequences of length " << alen; + for(int i = 0; i < Nseq ; i++) { + names.push_back(zorro_names[i]); + in_seq.push_back(zorro_raw_seq[i]); +// cout << "\nName["<AddNames(names); + CCluster::Instance()->AddTree(MakeTree(names,in_seq)); + + // Get the posteriors, either through computation or through HMM calcs + GetPosteriors(fileIn + suffixPP); + + cout << "\nReady to run" << flush; + + vector out_seq(Nseq); + vector PP(Nseq*Nseq,1); + + // Do the divvying and output + for(int i = 0; i < alen; i++) { + // Get the appropriate PPs + for(auto & x : PP) { x = 0; } // Initialise the matrix to zero + for(auto & pp : allPP) { + PP[(Nseq * pp.x()) + pp.y()] = PP[(Nseq * pp.y())+pp.x()] = pp.PP(i); + } + // Do the divvying + vector > divvy = CCluster::Instance()->OutputClusters(PP,threshold); + // Sort output + if(divvy.size() == Nseq && skipSingles) { continue; } // Skip clusters that are fully split +// cout << "\n["<" << names[i] << endl; +// out << "\nLength " << out_seq[i].str().size(); + out << out_seq[i].str() << endl; + } + out.close(); + + cout << "\nDone...\n\n"; + +} + +void GetPosteriors(string File) { + assert(Nseq > 0); + assert(!names.empty() && !in_seq.empty()); + // If the file exists then try reading it + if(file_exist(File)) { + cout << "\nPosterior probability file <" << File << "> exists. \n\tReading file"; + vector Toks; + string tmp; + ifstream in(File); + // Check sequence number + tmp = read_line(in); + Toks = Tokenise(tmp); + if(Toks.size() != 2) { cout << "\nNumber of sequences on first line contains multiple tokens in PP file\n"; exit(-1); } + if(atoi(Toks[0].c_str()) != Nseq) { cout << "\nNumber of sequences in first line of PP file does not match data\n"; exit(-1); } + if(atoi(Toks[1].c_str()) != alen) { cout << "\nLength of alignment in PP file does not match data\n"; exit(-1); } + cout << " ... checking names" << flush; + // Check sequence names + for(int i = 0 ;i < Nseq; i++) { + tmp = read_line(in); + Toks = Tokenise(tmp); + if(Toks.size() != 1) { cout << "\nName of sequence " << i << " has white spaces. This is not allowed\n"; exit(-1); } + if(Toks[0] != names[i]) { cout << "\nName of sequence " << i << " does not match that in data. This is not allowed\n"; exit(-1); } + } + cout << " ... checking PPs" << flush; + // Input PPs + while(getline(in,tmp)) { + allPP.push_back(CPostP(tmp)); + } + in.close(); + cout << " ... success" << flush; + } + // Otherwise calculate the posteriors + else { +// cout << "\nGetting posteriors " << flush; + // Do the calculation + for(auto & x : CCluster::Instance()->PairsToCalculate()) { + getSinglePosterior(x[0],x[1]); +// cout << "\nCalculating (" << x[0] << "," << x[1] << ")" << flush; + allPP.push_back(CPostP(x[0],x[1],zorro_posterior,alen)); +// cout << " ... done"; + } + // Do the output + ofstream out(File.c_str()); + out << Nseq << " " << alen; + for(string &n : names) { out << endl << n; } + for(CPostP &pp : allPP) { out << endl < n, vector s) { + int no_seq = n.size(); + assert(n.size() == s.size()); + vector distances(no_seq * no_seq, 0); + for(int i = 0; i < no_seq; i++) { + for(int j = i+1; j < no_seq; j++) { + distances[(j*no_seq)+i] = distances[(i*no_seq)+j] = AAJCdist(GetPercentDiff(s[i],s[j])); + } + } + return DoBioNJ(distances,n); +} + +double AAJCdist(double p) { + return -(19.0/20.0) * log(1 - ( (20.0/19.0) * p) ); +} + +double GetPercentDiff(string seq1, string seq2) { + int diff = 0, total = 0; + assert(seq1.size() == seq2.size()); + for(int i = 0 ; i < seq1.size(); i++) { + if(IsGap(seq1[i]) || IsGap(seq2[i])) { continue; } + total ++; + if(seq1[i] != seq2[i]) { diff ++; } + } + return (double) diff / (double) total; +} + +bool IsGap(char c) { + string gaps = "-X?"; + for(int j = 0 ; j < gaps.size(); j++) { + if(c == gaps[j]) { return true; } + } + return false; +} + diff --git a/Divvier.h b/Divvier.h new file mode 100644 index 0000000..245ce04 --- /dev/null +++ b/Divvier.h @@ -0,0 +1,61 @@ +/* + * Divvier.h + * + * Created on: 24 Oct 2017 + * Author: simon + */ + +#ifndef DIVVIER_H_ +#define DIVVIER_H_ + +#include +#include +#include +#include "Tree.h" // Mucky dependency, but this has the tools in it + +std::string MakeTree(std::vector Names, std::vector Seqs); +double GetPercentDiff(std::string seq1, std::string seq2); +double AAJCdist(double p); +bool IsGap(char c); + +std::string DoBioNJ(std::vector PWdists, std::vector Names, bool DoNumbers = false); + +void GetPosteriors(std::string File); + +// Class for the posterior probabilities +class CPostP { +public: + CPostP(int X, int Y, double *postP, int len) { + _X = X; _Y = Y; + for(int i = 0; i < len; i++) { + _PP.push_back(my_max(0.0, my_min( 1.0 , postP[i] ))); + } + } + CPostP(std::string input) { // Reads in same format as out + std::vector Toks = Tokenise(input); + if(Toks.size() < 6) { cout << "\nError in CPostP::CPostP(input) -- line needs at least 1 posterior probability: "<< input << endl; exit(-1); } + if(Toks[0][0] != '[' || Toks[4][0] != ']') { cout << "\nSuspect opening to posterior probability line in CPostP::CPostP(input): " << input << endl; exit(-1); } + _X = atoi(Toks[1].c_str()); _Y = atoi(Toks[3].c_str()); + assert(_X >= 0 && _Y >= 0); + for(int i = 5; i < Toks.size() ; i++) { + _PP.push_back(my_max(0.0, my_min( 1.0 , atof(Toks[i].c_str())))); + } + } + std::string out() { + std::stringstream ss; + ss <<"[ " << _X << " , " << _Y <<" ]"; + for(int i = 0 ; i < _PP.size(); i++) { ss<< "\t" << _PP[i]; } + return ss.str(); + } + int x() { return _X; } + int y() { return _Y; } + double PP(int i) { return _PP[i]; } + +private: + + int _X, _Y; // The sequences + std::vector _PP; // The posterior probabilities +}; + + +#endif /* DIVVIER_H_ */ diff --git a/Makefile b/Makefile index cc8b57c..338cdde 100644 --- a/Makefile +++ b/Makefile @@ -1,27 +1,57 @@ CC=gcc - +CPP=g++ +CPPFLAGS = -O3 -std=c++14 -Wall -Wmissing-prototypes -Wshadow -fmessage-length=0 -msse2 -mfpmath=sse CFLAGS = -O3 -Wall -Wmissing-prototypes -Wshadow #CFLAGS = -g - -#LIB = -lm -lmysqlclient -L/panfs/panfs.gm.berkeley.edu/download/souravc/local/lib/mysql/ +# C Code LIB = -lm OBJS = trim.o utils.o hmm.o matrices.o HDR = trim.h utils.h hmm.h matrices.h INC = -I/usr/local/include - +# CPP code +CPPOBJS = Cluster.o Tree.o Random.o Divvier.o bionj.o +CPPHDR = Cluster.h Tree.h Random.h Divvier.h all : fastZorro -.c.o: $(HDR) - $(CC) $(CFLAGS) $(INC) -c $*.c +# Zorro +trim.o : trim.c trim.h + $(CC) $(CFLAGS) $(INC) -c trim.c + +utils.o : utils.c utils.h + $(CC) $(CFLAGS) $(INC) -c utils.c + +hmm.o : hmm.c hmm.h + $(CC) $(CFLAGS) $(INC) -c hmm.c + +matrices.o : matrices.c matrices.h + $(CC) $(CFLAGS) $(INC) -c matrices.c + +fastZorro : $(CPPOBJS) $(OBJS) $(HDR) + $(CPP) $(CPPFLAGS) $(INC) $(LIB) -o divvier $(OBJS) $(CPPOBJS) + +#Cluster +Cluster.o : Cluster.cpp Cluster.h + $(CPP) $(CPPFLAGS) $(INC) -c Cluster.cpp + +Tree.o : Tree.cpp Tree.h + $(CPP) $(CPPFLAGS) $(INC) -c Tree.cpp + +Random.o : Random.cpp Random.h + $(CPP) $(CPPFLAGS) $(INC) -c Random.cpp + +#bionj +bionj.o : bionj.cxx + $(CPP) $(CPPFLAGS) $(INC) -c bionj.cxx -fastZorro : $(OBJS) $(HDR) - $(CC) $(CFLAGS) $(INC) $(LIB) -o fastZorro $(OBJS) +# Divvier +Divvier.o : Divvier.cpp Divvier.h + $(CPP) $(CPPFLAGS) $(INC) -c Divvier.cpp clean: - rm -f $(OBJS) - rm -f fastZorro + rm -f $(OBJS) $(CPPOBJS) + rm -f divvier rm -f core rm -f *~ rm -f a.out diff --git a/Random.cpp b/Random.cpp new file mode 100644 index 0000000..66f239f --- /dev/null +++ b/Random.cpp @@ -0,0 +1,197 @@ +/* + * Random.cpp + * + * Created on: Oct 20, 2017 + * Author: simon + */ + +/* ------------------------------------------------------------------------- + * This is an ANSI C library for multi-stream random number generation. + * The use of this library is recommended as a replacement for the ANSI C + * rand() and srand() functions, particularly in simulation applications + * where the statistical 'goodness' of the random number generator is + * important. The library supplies 256 streams of random numbers; use + * SelectStream(s) to switch between streams indexed s = 0,1,...,255. + * + * The streams must be initialized. The recommended way to do this is by + * using the function PlantSeeds(x) with the value of x used to initialize + * the default stream and all other streams initialized automatically with + * values dependent on the value of x. The following convention is used + * to initialize the default stream: + * if x > 0 then x is the state + * if x < 0 then the state is obtained from the system clock + * if x = 0 then the state is to be supplied interactively. + * + * The generator used in this library is a so-called 'Lehmer random number + * generator' which returns a pseudo-random number uniformly distributed + * 0.0 and 1.0. The period is (m - 1) where m = 2,147,483,647 and the + * smallest and largest possible values are (1 / m) and 1 - (1 / m) + * respectively. For more details see: + * + * "Random Number Generators: Good Ones Are Hard To Find" + * Steve Park and Keith Miller + * Communications of the ACM, October 1988 + * + * Name : rngs.c (Random Number Generation - Multiple Streams) + * Authors : Steve Park & Dave Geyer + * Language : ANSI C + * Latest Revision : 09-22-98 + * ------------------------------------------------------------------------- + */ + +#include "Random.h" +#include +#include +#include +#include +#include +#include + +#define MODULUS 2147483647 /* DON'T CHANGE THIS VALUE */ +#define MULTIPLIER 48271 /* DON'T CHANGE THIS VALUE */ +#define CHECK 399268537 /* DON'T CHANGE THIS VALUE */ +#define STREAMS 256 /* # of streams, DON'T CHANGE THIS VALUE */ +#define A256 22925 /* jump multiplier, DON'T CHANGE THIS VALUE */ +#define DEFAULT 123456789 /* initial seed, use 0 < DEFAULT < MODULUS */ + +static long seed[STREAMS] = {DEFAULT}; /* current state of each stream */ +static int stream = 0; /* stream index, 0 is the default */ +static int initialized = 0; /* test for stream initialization */ + + + double Random(void) +/* ---------------------------------------------------------------- + * Random returns a pseudo-random real number uniformly distributed + * between 0.0 and 1.0. + * ---------------------------------------------------------------- + */ +{ + const long Q = MODULUS / MULTIPLIER; + const long R = MODULUS % MULTIPLIER; + long t; + + t = MULTIPLIER * (seed[stream] % Q) - R * (seed[stream] / Q); + if (t > 0) + seed[stream] = t; + else + seed[stream] = t + MODULUS; + + return ((double) seed[stream] / MODULUS); +} + + + void PlantSeeds(long x) +/* --------------------------------------------------------------------- + * Use this function to set the state of all the random number generator + * streams by "planting" a sequence of states (seeds), one per stream, + * with all states dictated by the state of the default stream. + * The sequence of planted states is separated one from the next by + * 8,367,782 calls to Random(). + * --------------------------------------------------------------------- + */ +{ + const long Q = MODULUS / A256; + const long R = MODULUS % A256; + int j; + int s; + + initialized = 1; + s = stream; /* remember the current stream */ + SelectStream(0); /* change to stream 0 */ + PutSeed(x); /* set seed[0] */ + stream = s; /* reset the current stream */ + for (j = 1; j < STREAMS; j++) { + x = A256 * (seed[j - 1] % Q) - R * (seed[j - 1] / Q); + if (x > 0) + seed[j] = x; + else + seed[j] = x + MODULUS; + } +} + + + void PutSeed(long x) +/* --------------------------------------------------------------- + * Use this function to set the state of the current random number + * generator stream according to the following conventions: + * if x > 0 then x is the state (unless too large) + * if x < 0 then the state is obtained from the system clock + * if x = 0 then the state is to be supplied interactively + * --------------------------------------------------------------- + */ +{ + char ok = 0; + + if (x > 0) + x = x % MODULUS; /* correct if x is too large */ + if (x < 0) + x = ((unsigned long) time((time_t *) NULL)) % MODULUS; + if (x == 0) + while (!ok) { + printf("\nEnter a positive integer seed (9 digits or less) >> "); + scanf("%ld", &x); + ok = (0 < x) && (x < MODULUS); + if (!ok) + printf("\nInput out of range ... try again\n"); + } + seed[stream] = x; +} + + + void GetSeed(long *x) +/* --------------------------------------------------------------- + * Use this function to get the state of the current random number + * generator stream. + * --------------------------------------------------------------- + */ +{ + *x = seed[stream]; +} + + + void SelectStream(int index) +/* ------------------------------------------------------------------ + * Use this function to set the current random number generator + * stream -- that stream from which the next random number will come. + * ------------------------------------------------------------------ + */ +{ + stream = ((unsigned int) index) % STREAMS; + if ((initialized == 0) && (stream != 0)) /* protect against */ + PlantSeeds(DEFAULT); /* un-initialized streams */ +} + + + void TestRandom(void) +/* ------------------------------------------------------------------ + * Use this (optional) function to test for a correct implementation. + * ------------------------------------------------------------------ + */ +{ + long i; + long x; + double u; + char ok = 0; + + SelectStream(0); /* select the default stream */ + PutSeed(1); /* and set the state to 1 */ + for(i = 0; i < 10000; i++) + u = Random(); + GetSeed(&x); /* get the new state value */ + ok = (x == CHECK); /* and check for correctness */ + + SelectStream(1); /* select stream 1 */ + PlantSeeds(1); /* set the state of all streams */ + GetSeed(&x); /* get the state of stream 1 */ + ok = ok && (x == A256); /* x should be the jump multiplier */ + if (ok) + printf("\n The implementation of rngs.c is correct.\n\n"); + else + printf("\n\a ERROR -- the implementation of rngs.c is not correct.\n\n"); +} + +int RandInt(int Begin, int End) { + return (int) (Begin + ( (End - Begin) * Random() ) ); +} + + diff --git a/Random.h b/Random.h new file mode 100644 index 0000000..32eac32 --- /dev/null +++ b/Random.h @@ -0,0 +1,43 @@ +/* + * Random.h + * + * Created on: Oct 20, 2017 + * Author: simon + */ + +#ifndef RANDOM_H_ +#define RANDOM_H_ + +/* ----------------------------------------------------------------------- + * Name : rngs.h (header file for the library file rngs.c) + * Author : Steve Park & Dave Geyer + * Language : ANSI C + * Latest Revision : 09-22-98 + * ----------------------------------------------------------------------- + */ + +#if !defined( _RNGS_ ) +#define _RNGS_ + +#include +#include +#include +#include +#include +#include +#include + +double Random(void); +void PlantSeeds(long x); +void GetSeed(long *x); +void PutSeed(long x); +void SelectStream(int index); +void TestRandom(void); + +#endif + +int RandInt(int begin,int end); +inline double RandDouble(double Low, double High) { return Low + (fabs(High - Low) * Random()); } + + +#endif /* RANDOM_H_ */ diff --git a/Tree.cpp b/Tree.cpp new file mode 100644 index 0000000..3b21e44 --- /dev/null +++ b/Tree.cpp @@ -0,0 +1,1990 @@ +// Code for classes CTree and CNode +///////////////////////////////////////////// +// WARNING: This code has been heavily tampered with +// Some of the entry and exit condition of CTree function are *NOT* tested + + +// Include files +#include "Tree.h" +#define TREE_DEBUG 0 + + +#if DO_MEMORY_CHECK +extern CMemChecker memory_check; +#endif + +// Firstly describe CNode +///////////////////////////////////////////////// + +// General constructor +//////////////////////////////////////////// +CNode::CNode(int NoLinks,int *LinkList) { +#if DO_MEMORY_CHECK + memory_check.CountCNode++; +#endif + SetNode(NoLinks,LinkList); } +CNode::CNode(int la, int lb, int lc, int ba, int bb, int bc,int IntVal) { +#if DO_MEMORY_CHECK + memory_check.CountCNode++; +#endif +SetNode(la,lb,lc,ba,bb,bc,IntVal); } +CNode::CNode(int la, int lb, int IntVal) { +#if DO_MEMORY_CHECK + memory_check.CountCNode++; +#endif +SetNode(la,lb,IntVal); } +CNode::CNode(vector L, vector B, int IntVal) { +#if DO_MEMORY_CHECK + memory_check.CountCNode++; +#endif + assert(L.size() == B.size()); + switch(L.size()) { + case 1: SetNode(L[0],B[0],IntVal); break; + case 3: SetNode(L[0],L[1],L[2],B[0],B[1],B[2],IntVal); break; + default: Error("Unknown type of node in constructor..."); + } +} +CNode::CNode(const CNode &Node) {int i; +#if DO_MEMORY_CHECK + memory_check.CountCNode++; +#endif + CleanNode(); + m_NodeType = Node.m_NodeType; + FOR(i,(int)Node.m_viBranch.size()) { m_viBranch.push_back(Node.m_viBranch[i]); } + FOR(i,(int)Node.m_viLink.size()) { m_viLink.push_back(Node.m_viLink[i]); } +} +// Destructor function +/////////////////////////////////////////// +CNode::~CNode() { +#if DO_MEMORY_CHECK + memory_check.CountCNode--; +#endif + CleanNode(); } + +// SetNode functions for bifurcating trees +// These are appallingly verbose, but work... +/////////////////////////////////////////////// +void CNode::SetNode(int la,int lb, int IntVal) { // Leaf node + CleanNode(); + m_NodeType = leaf; + m_viLink.push_back(la); + m_viBranch.push_back(lb); + m_iInternalNodeNum = IntVal; +} + +// Internal node +void CNode::SetNode(int la, int lb, int lc, int ba, int bb, int bc, int IntVal) { + CleanNode(); + m_NodeType = branch; + m_viLink.push_back(la); m_viLink.push_back(lb); m_viLink.push_back(lc); + m_viBranch.push_back(ba); m_viBranch.push_back(bb); m_viBranch.push_back(bc); + m_iInternalNodeNum = IntVal; +} + +// Root node +void CNode::SetNode(int la, int lb, int ba, int bb, int IntVal) { + CleanNode(); + m_NodeType = root; + m_viLink.push_back(la); m_viLink.push_back(lb); + m_viBranch.push_back(ba); m_viBranch.push_back(bb); + m_iInternalNodeNum = IntVal; +} + +/////////////// General links creation function //////////////////// +//////////////////////////////////////////////////////////////////// + +void CNode::SetNode(int NoLinks, int *LinkList) { + int i; + CleanNode(); + // Specify leaf node + if(NoLinks < 1) { + m_NodeType = leaf; + } else if(NoLinks == 2) { + FOR(i,NoLinks) { m_viLink.push_back(LinkList[i]); } + m_NodeType = root; + } else { // Specify branch node + FOR(i,NoLinks) { m_viLink.push_back(LinkList[i]); } + m_NodeType = branch; + } +} + +// Clean Node +//////////////////////////////////////////////////////// +void CNode::CleanNode() { + m_iInternalNodeNum = -1; + m_viBranch.clear(); + m_viLink.clear(); +} + +// Stream operators for CNode +/////////////////////////////////////////// +ostream& operator<<(ostream& os, const CNode &Node) { + int i; + switch(Node.m_NodeType) { + case branch: os << "\nInternal node "; break; + case leaf: os << "\nExternal node "; break; + case root: os << "\nRoot node "; break; + default: os << "\n\nUnknown node type operator<<:(\n\n"; exit(-1); + } + if(Node.m_iInternalNodeNum >= 0) { os << "[ "< Names, bool AllowFail, bool AllowSubTree) { +#if DO_MEMORY_CHECK + memory_check.CountCTree++; +#endif + m_iRootNode = -1; m_bRooted = false; m_bValid = false; m_ariBraLinks = NULL; CreateTree(TREE,Names,true,AllowFail,AllowSubTree); } // Basic constructor + +int IsTreeGap(char Check) { + if(Check!='('&&Check!=')'&&Check!=';'&&Check!=','&&Check!=':') { return 1; } + return 0; +} + +// CTree copy constructor +///////////////////////////////////////////////////// +CTree::CTree(const CTree &Tree) { +#if DO_MEMORY_CHECK + memory_check.CountCTree++; +#endif + m_bValid = false; + m_Node = NULL; m_bReady = false ;m_iOutBra=false; m_ariBraLinks = NULL; m_bOldTree = Tree.m_bOldTree; + m_iNoNode = m_iNoBra = 0; + *this = Tree; +} + +// Underlying Constructor function +///////////////////////////////////////////////////// +// Takes two arguments: char tree[], and int NoSeq +// tree[] is the tree is bracket form eg. ((1,2)3,4) +// NOTE1: The nnumber of brackets determines whether the tree is classified as rooted +// or unrooted. Brackets == NoSeq-1 => rooted, Brackets == NoSeq-2 => unrooted +// NOTE2: The two species case is a special case. When two sequences are observed the tree +// is input in the form of a rooted tree. +// NOTE3: !!!TODO:!!! This code expects reversible models to transform rooted tree to unrooted tree +// for efficient calculation +// NOTE4: This routine is wasteful of space as it takes an extra node rooted trees regardless of their shape +// NoSeq is the number of leaf nodes in the tree +///////////////////////////////////////////////////// +void CTree::CreateTree(string Tree, vector Names, bool CheckVar, bool AllowFail,bool AllowSubTree) { + int i,j,pRight=0,pLeft=0,NextPointer=0,Parent, IntVal = -1, mem_seq, countBra, SubSeq; + int NoSeq = Names.size(); // Get the number of sequences from the names provide + double TempBranch[3]; // Stores branch lengths + int TempLabel[3]; // Stores branch labels + bool *CheckAlloc=NULL; // Checks node allocation + string TempTree; + vector Toks; + list temp_l; // Space for reorganising branch labels + vector temp_l2; // More space + m_vsName = Names; + // Do some checks on the string Tree + // 1. Check ends in ';' + if(Tree.find(";",(int)Tree.size()-1) == string::npos) { Tree += ";"; } + // Count the '(' & ')' to calculate # of sequences and remove all spaces + pLeft = pRight = j = 0; + FOR(i,(int)Tree.size() - 1) { + if(Tree[i] == '(') { pLeft++;} if(Tree[i] == ')') { pRight ++; } // Count number of parentheses + if((Tree[i] == '(' || Tree[i] == ',') && isalnum(Tree[i+1]) != 0) { j++; } // Count hte number of species + } + + //////////////////////////////////////////////////////////////// + // Do some error checking and check the number of species + // 1. Left and Right parentheses + if(pRight != pLeft) { + cout << "\nUnequal number of pLeft(="< 0 && j != NoSeq && AllowSubTree == false) { + if(CheckVar) { Error("\nDisagreement between number of species in file header ("+int_to_string(NoSeq)+") and number of species in file ("+int_to_string(j)+")"); } + else { + // Make sure difference between input tree and space reserved + mem_seq = NoSeq; NoSeq = j; + } + } else { + if(AllowSubTree) { assert(NoSeq > j); mem_seq = NoSeq; SubSeq = j; } + else { mem_seq = NoSeq = SubSeq = j; } + } + // 3. Number of internal nodes is correct + if(NoSeq == 2) { m_bRooted = false; } + else if(pRight!=(NoSeq-1) && NoSeq>2) { // For unrooted trees #internal nodes = #seq - 2 + m_bRooted = false; + } else if(pRight!=(NoSeq-2) && NoSeq>2) { // For rooted trees #internal nodes = #seq - 1 + m_bRooted = true; + } else { + cout << "\nIncorrect number of nodes for sequence number("<2) { // Prepare first bit of trNextPointeree if more than two species + // Loop to allocate nodes pLeft, pRight, Next are the old child0 child1 and parent nodes + // these are allocated to m_Node->link[0], [1] and [2] respectively This is used to some effect + // when allocated branches and so on be careful not to screw it up + if(m_bRooted) { countBra = SubSeq-1; } else { countBra = SubSeq-3; } // Note SubSeq used to allow subtrees + // Note the last node is done in this routine for rooted trees and finalised in the next section + for(NextPointer = m_iNoSeq, i=0;iSetNode(NextPointer,pLeft,-1); + } else { + assert(m_Node[pLeft]->m_viLink.size() > 2 && m_Node[pLeft]->m_viBranch.size() > 2); +// cout << "\n\tDoing Left["<m_viLink[2]<<"]: " << NextPointer; + m_Node[pLeft]->m_viLink[2] = NextPointer; + m_Node[pLeft]->m_viBranch[2] = pLeft; + } + SetB(pLeft,TempBranch[0],true,true); m_viBranchLabels[pLeft] = TempLabel[0]; + if(pRight < m_iNoSeq) { + if(CheckAlloc[pRight] != false) { cout << "\nNode " << pRight << " multiply allocated.\nUsually caused by same sequence occuring multiple times in tree\n\n"; if(AllowFail) { return; } Error(); } + CheckAlloc[pRight] = true; + m_Node[pRight]->SetNode(NextPointer,pRight, -1); + } else { + assert(m_Node[pRight]->m_viLink.size() > 2 && m_Node[pRight]->m_viBranch.size() > 2); +// cout << "\n\tDoing Right["<m_viLink[2]<<"]: " << NextPointer; + m_Node[pRight]->m_viLink[2] = NextPointer; + m_Node[pRight]->m_viBranch[2] = pRight; } + SetB(pRight,TempBranch[1],true,true); m_viBranchLabels[pRight] = TempLabel[1]; + // Initialise the new Node + if(CheckAlloc[NextPointer] != false) { cout << "\nNode " << NextPointer << " multiply allocated.\nUsually caused by same sequence occuring multiple times in tree\n\n"; if(AllowFail) { return; } Error(); } + CheckAlloc[NextPointer] = true; + // Create an internal node +// cout << "\n\tAllocating internal node["<SetNode(NextPointer,j, -1); + } + // Assign the branch lengths + SetB(j,TempBranch[i],true,true); m_viBranchLabels[j] = TempLabel[i]; + switch(i) { + case 0: pLeft = j; break; + case 1: pRight = j; break; + case 2: Parent = j; break; + default: cout << "\nError assigning branch lengths...\n\n"; if(AllowFail) { return; } Error(); + } } + if(m_Node[NextPointer] != NULL) { delete m_Node[NextPointer]; } + m_Node[NextPointer] = new CNode(pLeft,pRight,Parent,pLeft,pRight,Parent,IntVal); // Initialise the final node + if(m_Node[pLeft]->m_viLink.size() > 2) { m_Node[pLeft]->m_viLink[2] = NextPointer; } + if(m_Node[pRight]->m_viLink.size() > 2) { m_Node[pRight]->m_viLink[2] = NextPointer; } + if(m_Node[Parent]->m_viLink.size() > 2) { m_Node[Parent]->m_viLink[2] = NextPointer; } +// cout << "\nDone!" << flush; + } else { + // Do the rooted tree by adjusting the root node. + m_iRootNode = NextPointer -1; + } + } else { // Otherwise for two species + if(SubSeq == m_iNoSeq) { + if(m_Node[0] != NULL) { delete m_Node[0]; } + m_Node[0] = new CNode(1,0); m_Node[1] = new CNode(0,0); + //cout << "\nTempTree: " << TempTree << " cf. " << TempBranch << flush; + i=1; while(isdigit(TempTree[i])) { i++; } // Do first side + SetB(0,DoBranch(&TempTree,&i,&IntVal),true,true); + while(TempTree[i]!=':' && TempTree[i] != ';') { i++; } + if(TempTree[i] == ':') { AddB(0,DoBranch(&TempTree,&i,&IntVal),true,true); } // Get branch2 + } + else { + TempBranch[0] = TempBranch[1] = TempBranch[2] = 0.0; + find_closest(&TempTree,&pLeft,&pRight,&Parent,NextPointer, TempBranch, TempLabel,&IntVal); + if(m_Node[pLeft] != NULL) { delete m_Node[pLeft]; } + m_Node[pLeft] = new CNode(pRight,0); m_Node[pRight] = new CNode(pLeft,0); + SetB(0,TempBranch[0] + TempBranch[1],true,true); + ReplaceBraLink(0,0,pRight); ReplaceBraLink(0,1,pLeft); + } + } + FOR(i,NoNode()) { if(CheckAlloc[i] == false) { break; } } // Check if all nodes assigned + if(i!=NoNode()) { CheckVar = false; FOR(i,m_iNoSeq) { if(NoLinks(i) > 0) { m_iStartCalc = i; break; } } } + BuildBraLinks(CheckVar); // Get the list of branches links + OrderNode(-1,true); + m_bReady=true; // Tree is now ready!!! + // Do some branch by branch things + m_iNoOptBra = m_iNoBra; + DEL_MEM(CheckAlloc); + ValidateTree(); + m_bValid = true; + // Remove branch labels if not used + for(i=1;i<(int)m_viBranchLabels.size();i++) { if(m_viBranchLabels[i] != m_viBranchLabels[0]) { break; } } + if(i == m_viBranchLabels.size()) { m_viBranchLabels.clear(); } + // Sort branch labels so sensibly labelled + temp_l.assign(m_viBranchLabels.begin(),m_viBranchLabels.end()); + temp_l.sort(); temp_l.unique(); + temp_l2.assign(temp_l.begin(),temp_l.end()); + FOR(i,(int)m_viBranchLabels.size()) { FOR(j,(int)temp_l2.size()) { if(temp_l2[j] == m_viBranchLabels[i]) { m_viBranchLabels[i] = j; break; } } } + m_iNoBraLabels = (int)temp_l2.size(); +} + +// Create the branch links. Also works for sub trees +void CTree::BuildBraLinks(bool Verify) { + int i,j,k; + int *BranCount; + // Initialise + GET_MEM(BranCount,int,m_iNoBra); + if(m_ariBraLinks != NULL) { FOR(i,m_iNoBra) { delete m_ariBraLinks[i]; } delete [] m_ariBraLinks; m_ariBraLinks = NULL; } + m_ariBraLinks = new int*[m_iNoBra]; + FOR(i,m_iNoBra) { m_ariBraLinks[i] = new int[2]; } + FOR(i,m_iNoBra) { + BranCount[i] = 0; + m_ariBraLinks[i][0] = m_ariBraLinks[i][1] = -1; + } + // Get assignments + FOR(i,m_iNoNode) { + FOR(j,(int)m_Node[i]->m_viBranch.size()) { + k = m_Node[i]->m_viBranch[j]; + if(k == -1) { continue; } + assert(k < m_iNoBra && k >= 0); + if(m_ariBraLinks[k][0] == -1) { m_ariBraLinks[k][0] = i; BranCount[k]++;} + else { m_ariBraLinks[k][1] = i; BranCount[k]++;} + } } + // Verify assignments + if(Verify == true) { + FOR(i,m_iNoBra) { + if(BranCount[i] != 2) { cout << "\nError in BranCount["<m_viLink.size()) { BuildBranches(m_Node[Node]->m_viLink[i],Node); } + return; + } + // PostOrder Tree-Traversal + FOR(i,(int)m_Node[NT]->m_viLink.size()) { + if(m_Node[NT]->m_viLink[i] == NF || m_Node[NT]->m_viLink[i] == -1) { m_Node[NT]->m_viBranch[i] = NT; continue; } + BuildBranches(m_Node[NT]->m_viLink[i],NT); + } + FOR(i,(int)m_Node[NF]->m_viLink.size()) { + if(m_Node[NF]->m_viLink[i] == NT) { m_Node[NF]->m_viBranch[i] = NT; break; } + } + +} + +// Find branch linking nodes i and j; returns -1 if doesn't exist +int CTree::FindBra(int N_i, int N_j) { + int i; + FOR(i,m_iNoBra) { + if( (m_ariBraLinks[i][0] == N_i && m_ariBraLinks[i][1] == N_j) || + (m_ariBraLinks[i][0] == N_j && m_ariBraLinks[i][1] == N_i) ) { return i; } + } + return -1; +} + +// Create a set of labels for branches corresponding to their numbers +void CTree::CreateBranchLabels(bool Force) { + int i; + if(!Force) { FOR(i,(int)m_viBranchLabels.size()) { if(m_viBranchLabels[i] != m_viBranchLabels[0]) { Error("\nTrying to CTree::CreateBranchLabels when they already exist\n\n"); } } } + if(m_viBranchLabels.empty()) { m_viBranchLabels.assign(m_iNoBra,0); } + FOR(i,(int)m_viBranchLabels.size()) { m_viBranchLabels[i] = i; } + m_iNoBraLabels = m_viBranchLabels.size(); +} + +// Destructor function +/////////////////////////////////////////////////// +CTree::~CTree() { +#if DO_MEMORY_CHECK + memory_check.CountCTree--; +#endif + CleanTree(); +} + +// Allocate the memory +void CTree::GetMemory(int NoSeq) { + string Name; + assert(m_bReady == false); + // Adding an extra node to all trees because tree may be rooted + int i; + assert(m_Node == NULL && m_ariBraLinks == NULL && m_vdBra.empty()); + if(m_bRooted) { m_iNoNode = (2*NoSeq)-1; m_iNoBra = (2*NoSeq)-2; m_iNoSeq = NoSeq; } + else { m_iNoNode = (2*NoSeq)-2; m_iNoBra = (2*NoSeq)-3; m_iNoSeq = NoSeq; } + GET_MEM(m_ariBraLinks,int*,m_iNoBra); + for(i=0;i CTree::Branches() { + int i; + vector Br; + FOR(i,NoBra()) { Br.push_back(B(i)); } + return Br; +} +double CTree::TreeLength() { + vector Temp = Branches(); + return Sum(&Temp); +} +double CTree::B(int Branch) { + assert(InRange(Branch,0,(int)m_vdBra.size())); + return m_vdBra[Branch]; +} +double CTree::SetB(int Branch, double Value, bool Update, bool Rescale) { + assert(InRange(Branch,-1,(int)m_vdBra.size())); + m_vdBra[Branch] = Value; + return B(Branch); +} +double CTree::MulB(int Branch, double Value, bool Update, bool Rescale) { + assert(InRange(Branch,0,(int)m_vdBra.size())); + return SetB(Branch,Value * B(Branch), Update, Rescale); ; +} +double CTree::AddB(int Branch, double Value, bool Update, bool Rescale) { + assert(InRange(Branch,0,(int)m_vdBra.size())); + return SetB(Branch, Value + B(Branch), Update, Rescale); +} + +double CTree::QuadB(int Branch) { + int i,j,NoBra = 1; + double TotalBra = B(Branch); // Gets branch length (and checks the branch is valid + FOR(i,2) { + FOR(j,(int)m_Node[m_ariBraLinks[Branch][i]]->m_viLink.size()) { + if(NodeBra(m_ariBraLinks[Branch][i],j) == -1 || NodeLink(m_ariBraLinks[Branch][i],j) == -1) { return -BIG_NUMBER; } + if(NodeBra(m_ariBraLinks[Branch][i],j) == Branch) { continue; } + NoBra++; + TotalBra += B(m_Node[m_ariBraLinks[Branch][i]]->m_viBranch[j]); + } } + return TotalBra / (double) NoBra; +} + +void CTree::ReplaceNodeLink(int N,vector L) { + assert(InRange(N,0,NoNode())); + m_Node[N]->m_viLink = L; + m_bFastCalcOK = false; +} +void CTree::ReplaceNodeLinkElement(int N, int Element, int Val) { + assert(InRange(N,0,NoNode())); + assert(InRange(Element,0,NoLinks(N))); + m_Node[N]->m_viLink[Element] = Val; + m_bFastCalcOK = false; +} +void CTree::ReplaceNodeBra(int N,vector B) { + assert(InRange(N,0,NoNode())); + m_Node[N]->m_viBranch = B; + m_bFastCalcOK = false; +} +void CTree::ReplaceNodeBraElement(int N, int Element, int Val) { + assert(InRange(N,0,NoNode())); + assert(InRange(Element,0,NoLinks(N)) && NoLinks(N) == m_Node[N]->m_viBranch.size()); + m_Node[N]->m_viBranch[Element] = Val; + m_bFastCalcOK = false; +} + +// Tree operator function +//////////////////////////////////////////////// + +void CTree::operator=(const CTree & Tree) { + int i; + if(!Tree.m_bReady || &Tree == this) { return; } + CleanTree(); // Clear memory + m_bRooted = Tree.m_bRooted; m_iRootNode = Tree.m_iRootNode; + GetMemory(Tree.m_iNoSeq); // New memory + // Transfer data + for(i=0;ifind_first_of("("); x2 = tree->find("(",x+1); y = tree->find_first_of(")") + 1; + // Find the matching parentheses + while(y > x2) { + x = x2; x2 = tree->find("(",x+1); + if(x2 == string::npos) { x2 = (int)tree->size();} + } + // Get the stuff to resolve and replace it in the tree + resolve = tree->substr(x,y-x); + tree->replace(x,y-x,int_to_string(n_p +1)); + // Get the required values + x2 = resolve.find_first_of(","); + // 1. Do left hand side + GetBraDetails(resolve.substr(resolve.find_first_of("(")+1,x2-1),c1,&bra[0],&label[0]); + // 2. Do right hand side + GetBraDetails(resolve.substr(x2+1),c2,&bra[1],&label[1]); +// cout << "\nc1 = " << *c1 << ", c2 = " << *c2 << ", m_iNoNode: " << m_iNoNode; +// cout << "\nNew tree: " << *tree; + // Reset for next part of loop + *p=n_p; + assert(InRange(*c1,0,m_iNoNode) && InRange(*c2,0,m_iNoNode)); +} + +// Function for getting branch details from a string of form Number:length#label +// Assumes bra and node are correctly initialised +void CTree::GetBraDetails(string s, int *node, double *bra, int *label) { + int c,d,i,j; + string sub; + *label = 0; *bra = SET_BRANCH; + // Organise '(' and ')' in strings; strictly one of each allowed + i = s.find("("); j = s.find(")"); + if(i!= string::npos) { s.replace(i,1,""); } i = s.find("("); if(j!= string::npos) { s.replace(j,1,""); } + i = s.find("("); j = s.find(")"); + if(i != string::npos || j != string::npos) { Error("\nCTree::GetBraDetails(...)\nTrying to resolve string with more than one parenthesis: " + s + " \n"); } +// cout << "\nGetBraDetails: " << s; + i = s.find(":"); if(i==string::npos) { i = s.size(); } + j = s.find(":"); if(j==string::npos) { j = s.size(); } + *node = atoi(s.substr(0,min(i,j)).c_str()) - TREE_START; + for(c=min(i,j);c<(int)s.size();c++) { + if(s[c] == ':') { + for(d=c+1;d<(int)s.size();d++) { + if(!(isdigit(s[d]) || s[d] == '.' || toupper(s[d]) == 'E' || s[d] == '-')) { break; } } + *bra = atof(s.substr(c+1,d-c-1).c_str()); + } else if (s[c] == '#') { + for(d=c+1;d<(int)s.size();d++) { if(!(isdigit(s[d]) || s[d] == '.')) { break; } } + *label = atoi(s.substr(c+1,d-c-1).c_str()); + } + } +// cout << "\nNode: " << *node << " bra: " << *bra << " label: " << *label; +} + +double CTree::DoBranch(string *resolve,int *pos, int *IntVal) { + int i = *pos,j; + double ret_val = SET_BRANCH; + // If there is obviously no branch return + if(i+1 >= (int)resolve->size()) { return ret_val; } + if(resolve->at(i) ==':') { + i++; + ret_val=atof(resolve->c_str()+i); + // Skips the digits involved for this + while((resolve->at(i)=='-')||(isdigit(resolve->at(i))) ||(resolve->at(i)=='.')||(resolve->at(i)=='e')) { i++;} + } + // Get the IntVal if it exists + // Requirement is that it falls between ')' and branch length's ':' + if(resolve->at(i) ==')' && resolve->at(i+1) != ':' && i+1 < (int)resolve->size()) { + j = i; i++; + while(resolve->at(i) != ':' && resolve->at(i) != ',' && resolve->at(i) != ')' && resolve->at(i) != '(' && resolve->at(i) != ';') { i++; } + if(resolve->at(i)==':' || resolve->at(i) == ';') { *IntVal = atoi(resolve->c_str()+j+1); } + i = j; + } + *pos = i; + if(ret_val < 0) { ret_val = 0; } + return ret_val; +} + +// Removes a node from a tree that has already had one of its links removed +int CTree::RemoveBraNode(int N) { + int i,j, count; + double BVal; + vector Link, Bra; + // Initialise and check entry conditions + assert(N >= NoSeq()); + count = 0; + FOR(i,NoLinks(N)) { + if(NodeLink(N,i) == -1) { count++; continue; } + Link.push_back(NodeLink(N,i)); Bra.push_back(NodeBra(N,i)); + ReplaceNodeLinkElement(N,i,-1); ReplaceNodeBraElement(N,i,-1); + } + assert(count == 1 && Link.size() == 2 && Bra.size() == 2); + // Do the branch lengths + BVal = B(Bra[0])+B(Bra[1]); + FOR(i,2) { SetB(Bra[i],BVal,true,true); } + // Now remove the nodes + FOR(i,2) { + FOR(j,NoLinks(Link[i])) { if(NodeLink(Link[i],j) == N) { break; } } + assert(j != NoLinks(Link[i])); + ReplaceNodeLinkElement(Link[i],j,Link[FlipBin(i)]); + ReplaceNodeBraElement(Link[i],j,Bra[0]); + m_ariBraLinks[Bra[0]][i] = Link[i]; + } + OrderNode(Link[0]); OrderNode(Link[1]); + m_ariBraLinks[Bra[1]][0] = m_ariBraLinks[Bra[1]][1] = -1; + return Bra[0]; +} + +// Cuts a branch +// returns the branch on priority subtree where they used to be attached -- i.e. the blank node +// -- Subtrees are maintained; Link specifies which one is given priority +int CTree::CutBranch(int B, int Link) { + int i,j,N,F, RetVal = -1; + // Remove facing nodes + FOR(i,2) { + N = BraLink(B,i); // Get node to + F = BraLink(B,FlipBin(i)); + FOR(j,NoLinks(N)) { if(NodeLink(N,j) == F) { break; } } + assert(j!=NoLinks(N)); + m_Node[N]->m_viLink[j] = -1; m_Node[N]->m_viBranch[j] = -1; + if(i != Link) { RetVal = RemoveBraNode(N); } + else { OrderNode(N); } + } + // Clean up the branch + m_ariBraLinks[B][0] = m_ariBraLinks[B][1] = -1; + return RetVal; +} + +// Cut Sequence function: removes a sequence from the tree +/////////////////////////////////////////////////////////////// +// Needs to store: +// 2 x branch numbers (leading to leaf node; trivial + spare branch) +// 1 x node number (node removed from tree) +// This will be held in the node (which is ready for addition to tree) +// +// Return value: +// Spare node number +// +// Assumptions: +// The branch leading to a leaf node has the # of the leaf node + +int CTree::RemoveLeafNode(int RemNode) { + /* Removes a Node from the tree + returns 0 if all okay, else 1 */ + int i,j,k,m,links[2],base_node;double total; + // Error checking for value passed + if(m_iNoSeq < 3) { + cout << "\nAttempted to remove a sequence when only two are left"; + return 1; + } + if(RemNode >= m_iNoSeq) { + cout << "\nAttempted to remove an internal node"; + return 1; + } + // Section to remove the corresponding section from the tree + // Deal with when removing a sequence from a three species tree + if(m_iNoSeq == 3) { + if(RemNode == 0) { m_Node[0] = m_Node[2]; total = B(1) + B(2); } + else if(RemNode == 1) { m_Node[1] = m_Node[2]; total = B(0) + B(2); } + else { total = B(0) + B(1); } + m_Node[0]->m_viLink[0] = 1; m_Node[0]->m_viBranch[0] = 0; + m_Node[1]->m_viLink[1] = 0; m_Node[0]->m_viBranch[0] = 0; + m_ariBraLinks[0][0] = 0; m_ariBraLinks[0][1] = 1; + SetB(0,total,true,true); + m_iNoSeq = 2; + m_bFastCalcOK = false; + return 0; + } + // Remove a sequence from the tree + // Get base_node and links + base_node = m_Node[RemNode]->m_viLink[0]; + assert(m_Node[base_node]->m_NodeType == branch); j=0; + for(total=0.0,i=0;i<3;i++) { + if(m_Node[base_node]->m_viLink[i] == RemNode) { continue; } + links[j++] = m_Node[base_node]->m_viLink[i]; + if(j==2) { // Set the disconnected branch link to null + FOR(m,2) { + if(m_ariBraLinks[ m_Node[base_node]->m_viBranch[i] ][m] != links[1]) { m_ariBraLinks[ m_Node[base_node]->m_viBranch[i] ][m] = -1; } + } } + total += B(m_Node[base_node]->m_viBranch[i]); + } + if(links[0] > links[1]) { i = links[0]; links[0] = links[1]; links[1] = i; } + // Reassign the correct links and so on + // Important do not change iOption(sequence node removed) or base_node(node removed) + k = 0; // the altered nodes will point to + for(k=-1,i=0;i<2;i++) { + if(m_Node[links[i]]->m_NodeType == branch) { + for(j=0;j<3;j++) { + if(m_Node[links[i]]->m_viLink[j] == base_node) { // Do rearrangements + if(i == 0) { + m_Node[links[0]]->m_viLink[j] = links[1]; + k = m_Node[links[0]]->m_viBranch[j]; + SetB(k,total,true,true); + FOR(m,2) { // Reassign branch links + if(m_ariBraLinks[k][m] == base_node) { m_ariBraLinks[k][m] = links[1]; break; } + } } + else { + assert(k>=0); + m_Node[links[1]]->m_viLink[j] = links[0]; + m_Node[links[1]]->m_viBranch[j] = k; + } } } } + else { + if(i==0) { + m_Node[links[0]]->m_viLink[0] = links[1]; + k = m_Node[links[0]]->m_viBranch[0]; + SetB(k,total,true,true); + FOR(m,2) { // Reassign branch links + if(m_ariBraLinks[k][m] == base_node) { m_ariBraLinks[k][m] = links[1]; break; } + } } + else { + m_Node[links[1]]->m_viLink[0] = links[0]; + // This shouldn't have to be here... + SetB(m_Node[links[1]]->m_viBranch[0],total,true,true); + m_Node[links[1]]->m_viBranch[0] = k; + } } } + // Tidy up the basenode + FOR(i,3) { + if(m_Node[base_node]->m_viLink[i] == RemNode) { continue; } + if(m_Node[base_node]->m_viLink[i] == links[0]) { // This is the branch thats removed + m_Node[base_node]->m_viBranch[i] = -1; + } + m_Node[base_node]->m_viLink[i] = -1; + } + OrderNode(); + BuildBraLinks(false); + m_bFastCalcOK = false; + ValidateTree(); + return 0; +} + +// AddSeq +// ------ +// For a sequence to be added it requires a bit of tree structure associated with it: +// 1) two nodes: one associated with the sequence and one internal +// 2) two branches: linking leaf and internal node, and a spare. +// BranchProp = relative position in branch (from low to high node nums) +// This must be prepared before passing to the function +// +// Returns -1 if the branch is the same as one of the ones in the cut section +int CTree::AddSeq(int SNum,int Bra, double Prop) { + int i,j,braN, braB, leafB,inN1, inN2,okay1, okay2;; + double Len = (1.0 - Prop) * B(Bra); + // Check entry conditions w.r.t. tree and initialise some variables + assert(SNum < m_iNoSeq && m_Node[SNum] != NULL && SNum < m_iNoNode); + assert(m_Node[SNum]->m_NodeType == leaf && (!m_Node[SNum]->m_viLink.empty()) && (!m_Node[SNum]->m_viBranch.empty())); + braN = m_Node[SNum]->m_viLink[0]; leafB = m_Node[SNum]->m_viBranch[0]; + assert(braN < m_iNoNode && m_Node[braN]->m_NodeType == branch && m_Node[braN]->m_viLink.size() == 3 && m_Node[braN]->m_viBranch.size() == 3); + braB = m_Node[braN]->m_viBranch[1]; + assert(m_Node[braN]->m_viLink[0] == SNum && m_Node[braN]->m_viLink[1] == -1 && m_Node[braN]->m_viLink[2] == -1); + assert(m_Node[braN]->m_viBranch[0] == leafB); + assert(m_Node[braN]->m_viBranch[0] == leafB && m_Node[braN]->m_viBranch[1] != -1 && m_Node[braN]->m_viBranch[2] == -1); + inN1 = m_ariBraLinks[Bra][0]; inN2 = m_ariBraLinks[Bra][1]; + assert(IsNode(inN1) && IsNode(inN2)); + if(inN1 > inN2) { i = inN1; m_ariBraLinks[Bra][0] = inN1 = inN2; m_ariBraLinks[Bra][1] = inN2 = i; } + + // Adjust some branch lengths + MulB(Bra,Prop,true,true); + SetB(braB,Len,true,true); + // Adjust the BraN + m_Node[braN]->m_viLink[1] = inN2; m_Node[braN]->m_viLink[2] = inN1; + m_Node[braN]->m_viBranch[2] = Bra; + // Adjust inN1 + FOR(i,(int)m_Node[inN1]->m_viLink.size()) { if(m_Node[inN1]->m_viLink[i] == inN2) { break; } } assert(i!=m_Node[inN1]->m_viLink.size()); + m_Node[inN1]->m_viLink[i] = braN; + FOR(j,2) { if(m_ariBraLinks[Bra][j] != inN1) { break; } } assert(j!=2); + m_ariBraLinks[Bra][j] = braN; + // Adjust inN2 + FOR(i,(int)m_Node[inN2]->m_viLink.size()) { if(m_Node[inN2]->m_viLink[i] == inN1) { break; } } assert(i!=m_Node[inN2]->m_viLink.size()); + m_Node[inN2]->m_viLink[i] = braN; + m_Node[inN2]->m_viBranch[i] = braB; + FOR(j,2) { if(m_ariBraLinks[braB][j] != braN) { break; } } assert(j!=2); + m_ariBraLinks[braB][j] = inN2; + // Check node have been assigned correctly + okay1 = okay2 = 0; + assert(m_ariBraLinks[leafB][0] == SNum && m_ariBraLinks[leafB][1] == braN); + assert(m_ariBraLinks[Bra][0] == inN1 && m_ariBraLinks[Bra][1] == braN); + okay1 = 0; FOR(i,2) { if(m_ariBraLinks[braB][i] == inN2) { okay1++; } else if(m_ariBraLinks[braB][i] == braN) { okay1++; } } + assert(okay1 == 2); + okay1 = okay2 = 0; + FOR(i,3) { assert(IsNode(m_Node[braN]->m_viLink[i]) && IsBra(m_Node[braN]->m_viBranch[i])); } + FOR(i,(int)m_Node[inN1]->m_viLink.size()) { + if(m_Node[inN1]->m_viLink[i] == braN) { okay1++; } + assert(IsNode(m_Node[inN1]->m_viLink[i]) && IsBra(m_Node[inN1]->m_viBranch[i])); + } + FOR(i,(int)m_Node[inN2]->m_viLink.size()) { + if(m_Node[inN2]->m_viLink[i] == braN) { okay2++; } + assert(IsNode(m_Node[inN2]->m_viBranch[i]) && IsBra(m_Node[inN2]->m_viBranch[i])); + + } + assert(okay1 = 1 && okay2 == 1); + OrderNode(); + m_bFastCalcOK = false; + ValidateTree(); + return 1; +} + +bool CTree::IsNode(int Node) { + if(Node >= 0 && Node < m_iNoNode) { return true; } + return false; +} + +bool CTree::IsBra(int Bra) { + if(Bra >= 0 && Bra < m_iNoBra) { return true; } + return false; +} + +bool CTree::GoodBra(int Bra) { + int i; + FOR(i,2) { + if(m_ariBraLinks[Bra][i] == -1) { return false; } + if(GoodNode(m_ariBraLinks[Bra][i]) == false) { return false; } + } + return true; +} + +bool CTree::GoodNode(int Node) { + int NodNum = Node; + if(Node < m_iNoSeq) { NodNum = m_Node[Node]->m_viLink[0]; } + if(m_Node[NodNum]->m_viLink[2] == -1) { return false; } + return true; +} + +int CTree::NoLeafLink(int N) { + int i,count = 0; + FOR(i,NoLinks(N)) { if(NodeLink(N,i) < m_iNoSeq) { count++; } } + return count; +} + +bool CTree::IsCutTree() { + int i; + FOR(i,m_iNoBra) { + if(m_ariBraLinks[i][0] == -1 || m_ariBraLinks[i][1] == -1) { return true; } + } + return false; +} + +int CTree::GetStart(bool replace) { + int i; + FOR(i,m_iNoSeq) { + if(m_Node[i] == NULL) { continue; } + if((int)m_Node[i]->m_viLink.size() !=1 ) { continue; } + if(!InRange(m_Node[i]->m_viLink[0],0,m_iNoNode)) { continue; } + if((int)m_Node[m_Node[i]->m_viLink[0]]->m_viLink.size() != 3) { continue; } + if(m_Node[m_Node[i]->m_viLink[0]]->m_viLink[2]!=-1) { break; } + } + assert(i != m_iNoSeq); + if(replace == true) { m_iStartCalc = i; } + return i; +} + +double CTree::GetTreeLength(bool first, int NTo, int NFr) { + int i,NodeNum; + static double length = 0; + // Initialise function + if(first == true) { + length = 0; + if(m_iNoSeq == 2) { return B(0); } // Always just the first branch for 2 sequences + return GetTreeLength(false,GetStart(false),-1); + } + assert(NTo >= 0 && NTo < m_iNoNode); + if(m_Node[NTo]->m_NodeType == branch) { NodeNum = 3; } else { NodeNum = 1; } + FOR(i,NodeNum) { + if(m_Node[NTo]->m_viLink[i] == NFr) { continue; } + length += B(m_Node[NTo]->m_viBranch[i]); + GetTreeLength(false,m_Node[NTo]->m_viLink[i],NTo); + } + return length; +} + +int CTree::BranchSets(int BranchNum, vector *Left, vector *Right) { + vector temp, temp2; + assert(Left != NULL && Right != NULL && BranchNum < m_iNoBra); + BuildBraLinks(false); + GetBraSets(m_ariBraLinks[BranchNum][0], m_ariBraLinks[BranchNum][1],Left, true); + GetBraSets(m_ariBraLinks[BranchNum][1], m_ariBraLinks[BranchNum][0],Right, true); + // Sort the nodes + if((int)Left->size() > 1) { sort(Left->begin(),Left->end()); } else if(Left->empty()) { return -1; } else if(Left->at(0) == -1) { return -1; } + if((int)Right->size() > 1) { sort(Right->begin(),Right->end()); } else if(Right->empty()) { return -1; } else if(Right->at(0) == -1) { return -1; } + // Ensure the one with sequence 0 in is in the left + if(Left->at(0) != 0) { temp = *Left; *Left = *Right; *Right = temp; } + if(Left->size() > Right->size()) { return (int)Left->size(); } + temp.clear(); temp2.clear(); + return (int) Right->size(); +} + +void CTree::GetBraSets(int NTo, int NFr, vector *List, bool First) { + int i; + if(NTo == -1) { return; } // Avoid empty links in the tree + if(First == true) { List->clear(); if(NTo < m_iNoSeq) { List->push_back(NTo); } } + FOR(i,(int)m_Node[NTo]->m_viLink.size()) { + // If already seen then continue + if(m_Node[NTo]->m_viLink[i] == NFr || m_Node[NTo]->m_viLink[i] == -1) { continue; } + // If an internal node then descend + else if(m_Node[NTo]->m_viLink[i] >= m_iNoSeq) { GetBraSets(m_Node[NTo]->m_viLink[i],NTo,List,false); } + // If an external node then store + else { List->push_back(m_Node[NTo]->m_viLink[i]); } +} } + +// Order the links in a node +void CTree::OrderNode(int NodeNum, bool DoBraToo) { + int i,j,k,tempL,tempB, Valj, Valk; + for(i=m_iNoSeq; im_viLink.size() == m_Node[i]->m_viBranch.size()); + FOR(j,(int)m_Node[i]->m_viLink.size()) { + for(k=j;k<(int)m_Node[i]->m_viLink.size();k++) { + Valj = m_Node[i]->m_viLink[j]; Valk = m_Node[i]->m_viLink[k]; + if(Valj == -1) { Valj = BIG_NUMBER; } + if(Valk == -1) { Valk = BIG_NUMBER; } + if(Valj> Valk) { + tempL = m_Node[i]->m_viLink[j]; tempB = m_Node[i]->m_viBranch[j]; + m_Node[i]->m_viLink[j] = m_Node[i]->m_viLink[k]; m_Node[i]->m_viBranch[j] = m_Node[i]->m_viBranch[k]; + m_Node[i]->m_viLink[k] = tempL; m_Node[i]->m_viBranch[k] = tempB; + } } } + if(m_Node[i]->m_viLink.size() == 3) { + if(m_Node[i]->m_viLink[1] == -1 && m_Node[i]->m_viLink[2] == -1) { + if(m_Node[i]->m_viBranch[2] > m_Node[i]->m_viBranch[1]) { + tempB = m_Node[i]->m_viBranch[1]; + m_Node[i]->m_viBranch[1] = m_Node[i]->m_viBranch[2]; + m_Node[i]->m_viBranch[2] = tempB; + } } } } } + if(DoBraToo == true) { // If need to do branches + FOR(i,m_iNoBra) { + if( ( m_ariBraLinks[i][0] > m_ariBraLinks[i][1] && m_ariBraLinks[i][1] != -1) || m_ariBraLinks[i][0] == -1) { + j = m_ariBraLinks[i][0]; + m_ariBraLinks[i][0] = m_ariBraLinks[i][1]; + m_ariBraLinks[i][1] = j; +} } } } + +/////////////////////////////////////////////////////////////////// +// Tree rooting and unrooting options +void CTree::Unroot() { + if(!IsRooted()) { return; } + if(!InRange(m_iRootNode,0,m_iNoNode)) { Error("\nTrying to CTree::Unroot when tree not ready\n\n"); } + assert(m_Node[m_iRootNode]->m_NodeType == root); + int i,j, branch = NodeBra(m_iRootNode,0), pLeft = NodeLink(m_iRootNode,0), pRight = NodeLink(m_iRootNode,1); + int braLeft = NodeBra(m_iRootNode,0), braRight = NodeBra(m_iRootNode,1); + CNode ** ppNodeStore; + // Correct the branch length and remove the right hand branch + m_vdBra[pLeft] = m_vdBra[braLeft] + m_vdBra[braRight]; + m_vdBra.erase(m_vdBra.begin() + pRight); + // Fix left node + FOR(i,NoLinks(pLeft)) { + if(NodeLink(pLeft,i) == m_iRootNode) { + ReplaceNodeLinkElement(pLeft,i,pRight); + ReplaceNodeBraElement(pLeft,i,branch); + break; + } } + assert(i != NoLinks(pLeft)); + // Fix right node + FOR(i,NoLinks(pRight)) { + if(NodeLink(pRight,i) == m_iRootNode) { + ReplaceNodeLinkElement(pRight,i,pLeft); + ReplaceNodeBraElement(pRight,i,branch); + break; + } } + assert(i != NoLinks(pRight)); + // Now remove the redundant node + ppNodeStore = new CNode*[m_iNoNode - 1]; + j = 0; FOR(i,m_iNoNode) { // Removes the node + if(i==m_iRootNode) { continue; } + ppNodeStore[j++] = m_Node[i]; + } + for(i=0;i > L1, L2; + vector L,R; + // Check some entry conditions + if(NoSeq() != Tree.NoSeq()) { return -1; } + // Get this trees sets + FOR(i,NoBra()) { BranchSets(i,&L,&R); L1.push_back(L); Tree.BranchSets(i,&L,&R); L2.push_back(L); } + // Get the distance + Dist = NoBra(); + FOR(i,NoBra()) { + FOR(j,NoBra()) { if(Compare(&L1[i],&L2[j]) == true) { Dist--; break; } } + } + return Dist; +} + +//////////////////////////////////////////////////////////////////////////// +// Function to check whether a subtree is compatible with the current tree +bool CTree::IsCompatible(CTree &SubTree) { + // Check input information + assert(SubTree.NoSeq() <= NoSeq()); // Check numbers of sequences are compatible + // Build the sets of splits + BuildSplits(); SubTree.BuildSplits(); + return SplitsCompatible(m_vSplits,NoSeq(),SubTree.m_vSplits,SubTree.NoSeq()); +} + +bool SplitsCompatible(vector Split1, int S1_seq, vector Split2, int S2_seq) { + int i,j; + // Check some entry conditions + assert((int)Split1.size() >= (int)Split2.size()); + assert((int)Split1.size() == (S1_seq * 2) -3); + assert((int)Split2.size() == (S2_seq * 2) -3); +// cout << "\n>>>>>>>> INTO SplitsCompatible <<<<<<<<<<<"; + // Compare the subtree's splits to the full tree one-by-one to check they're compatible + FOR(i,(int)Split2.size()) { + if(Split2[i].Left.empty() || (int)Split2[i].Left.size() < 2 || (int)Split2[i].Right.size() < 2 ) { continue; } // Skip empty splits or trivial splits +// cout << "\nTesting SubTree["<= (int)S2.Left.size() + (int)S2.Right.size()); + // Compares S2.Left to S1.Left and S1.Right + // 1. S2.Left vs S1.Left + count = 0; + FOR(i,(int)S1.Left.size()) { + if(S1.Left[i] > S2.Left[count]) { break; } // Always in order + if(S1.Left[i] == S2.Left[count]) { count++; if(count == (int)S2.Left.size()) { MatchLeft = true; break; } } // Match + } + // 2. S2.Left to S1. Right + if(!MatchLeft) { + count = 0; + FOR(i,(int)S1.Right.size()) { + if(S1.Right[i] > S2.Left[count]) { break; } // Always in order + if(S1.Right[i] == S2.Left[count]) { count++; if(count == (int)S2.Left.size()) { break; } } // Match + } + if(count != (int) S2.Left.size()) { return false; } // Neither Left nor right match + } + // Now checks right + if(MatchLeft) { + count = 0; + FOR(i,(int)S1.Right.size()) { + if(S1.Right[i] > S2.Right[count]) { break; } // Always in order + if(S1.Right[i] == S2.Right[count]) { count++; if(count == (int)S2.Right.size()) { return true; } } // Match + } + } else { + // 2. S2.Right vs S1.Left + count = 0; + FOR(i,(int)S1.Left.size()) { + if(S1.Left[i] > S2.Right[count]) { return false; } // Always in order + if(S1.Left[i] == S2.Right[count]) { count++; if(count == (int)S2.Right.size()) { return true; } } // Match + } + } + return false; +} + +// ofstream operator for tree +///////////////////////////////////////////////////// +// NOTE: The code for this section is hideous +// and badly described. Can't be bothered +// to debug however 'cause speed not issue +// and it works +///////////////////////////////////////////////////// + +ostream& operator<<(ostream& os, CTree &Tree) { + Tree.ValidateTree(); + if(Tree.m_iNoSeq == 2) { + os << "("; + if(Tree.m_bOutName && !Tree.m_vsName.empty()) { os << Tree.m_vsName[0]; } else { os << "1"; } + if(Tree.m_iOutBra == 1) { os << ":" << Tree.B(0); } + os << ","; + if(Tree.m_bOutName && !Tree.m_vsName.empty()) { os << Tree.m_vsName[1]; } else { os << "2"; } + if(Tree.m_iOutBra == 1) { os << ":0.0"; } + os << ")"; + return os; + } + os << "("; + if(Tree.IsCutTree()) { + if(Tree.NoLinks(Tree.StartCalc()) == 0) { Error("Trying to start calc from bad node\n"); } + Tree.OutNode(-1,Tree.m_Node[Tree.StartCalc()]->m_viLink[0],os); + } else { + if(Tree.m_bRooted) { Tree.OutNode(-1,Tree.m_iRootNode,os); } + else { Tree.OutNode(-1,Tree.m_Node[0]->m_viLink[0],os); } + } + os << ");"; + return os; +} + +ostream& CTree::OutNode(int FromNode, int ToNode, ostream &os) { + int i, count; +#if TREE_DEBUG + static vector OK; + if(FromNode == -1) { OK.clear(); OK.assign(NoNode(),false); } + if(OK[ToNode] == true) { cout << "\nTree error: "; OutDetail(cout); exit(-1); } + OK[ToNode] = true; +#endif + // If an internal node + if(m_Node[ToNode]->m_NodeType == branch) { + // First visit to node gives an open parenthesis + if(FromNode != -1) { os << "("; } // Ensures trifurcation at root + // Organise node visits + count = 0; FOR(i,3) { + if(m_Node[ToNode]->m_viLink[i] == FromNode || m_Node[ToNode]->m_viLink[i] == -1) { continue; } + if(count == 2) { os << ","; } // For first node put in the extra ',' + OutNode(ToNode,m_Node[ToNode]->m_viLink[i],os); + if(count == 0) { os << ","; } + count++; + } + // The return visit gives the closing parenthesis and, if required, the branch length + if(FromNode != -1) { os << ")"; } // Ensures trifurcation at root + if(FromNode != -1) { OutBranch(ToNode,FromNode,os); } + } else if(m_Node[ToNode]->m_NodeType == leaf) { // If an external node + if(ToNode < m_iNoSeq && m_vsName.size() == m_iNoSeq) { if(m_bOutName) { os << m_vsName[ToNode]; } else { os << ToNode+1; } } else { os << ToNode + 1; } + OutBranch(ToNode,NodeLink(ToNode,0),os); + if(FromNode == -1) { + if(NodeLink(ToNode,0) < m_iNoSeq && m_vsName.size() == m_iNoSeq) { if(m_bOutName) { os << "," << m_vsName[NodeLink(ToNode,0)]; } else { os << "," << NodeLink(ToNode,0); } } + else { os << "," << NodeLink(ToNode,0) + 1; } + if(m_iOutBra==1) { os << ":0.0"; } + } + } else if(m_Node[ToNode]->m_NodeType == root) { + // First visit to node gives an open parenthesis + if(FromNode != -1) { os << "("; } // Ensures trifurcation at root + // Organise node visits + count = 0; FOR(i,2) { + if(m_Node[ToNode]->m_viLink[i] == FromNode || m_Node[ToNode]->m_viLink[i] == -1) { continue; } + if(count == 2) { os << ","; } // For first node put in the extra ',' + OutNode(ToNode,m_Node[ToNode]->m_viLink[i],os); + if(count == 0) { os << ","; } + count++; + } + // The return visit gives the closing parenthesis and, if required, the branch length + if(FromNode != -1) { os << ")"; } // Ensures trifurcation at root + if(FromNode != -1) { OutBranch(ToNode,FromNode,os); } + } + return os; +} +ostream& CTree::OutBranch(int ToNode, int FromNode, ostream &os) { + int i; + switch(m_iOutBra) { + case 0: break; // No branch + case 1: // Simple branch length + if(m_iNoSeq == 2 && ToNode == 1) { os << ":0.0"; break; } + FOR(i,(int)m_Node[ToNode]->m_viLink.size()) { if(m_Node[ToNode]->m_viLink[i] == FromNode) { break; } } + assert(i != m_Node[ToNode]->m_viLink.size()); + os << ":" << B(m_Node[ToNode]->m_viBranch[i]); + if(m_bOutLabel && BranchLabels()) { os << "#" << m_viBranchLabels[m_Node[ToNode]->m_viBranch[i]]; } + break; + case 2: // Branch label + FOR(i,(int)m_Node[ToNode]->m_viLink.size()) { if(m_Node[ToNode]->m_viLink[i] == FromNode) { break; } } + assert(i != m_Node[ToNode]->m_viLink.size()); + os << ":b" << m_Node[ToNode]->m_viBranch[i]; + if(m_bOutLabel && BranchLabels()) { os << "#" << m_viBranchLabels[m_Node[ToNode]->m_viBranch[i]]; } + break; + default: + Error("Unknown request for m_iOutBra...\n\n"); + }; + return os; +} + +bool CTree::OutDetail(ostream &os, bool ForceExit) { + int i; + os << "\nTree has " << m_iNoNode << " nodes and " << m_iNoBra << " branches"; + os << "\nCalculations start at Node: " << m_iStartCalc; + os << "\nNodes: "; + FOR(i,m_iNoNode) { os << "\n\tNode["< CTree::ConstOut() { + int i,j; + vector List; + FOR(i,m_iNoSeq) { FOR(j,i) { List.push_back(NodeDist(i,j)); } } + assert(List.size() == (((m_iNoSeq * m_iNoSeq) - m_iNoSeq) / 2)); + return List; +} + +int CTree::NodeDist(int Node1, int Node2, int NodeFrom) { + int i; + static int dist; + static bool Fix; + if(NodeFrom == -1) { + if(Node1 == Node2) { return 0; } + Fix = false; dist = 0; + } + FOR(i,(int)m_Node[Node1]->m_viLink.size()) { + if(m_Node[Node1]->m_viLink[i] == NodeFrom || m_Node[Node1]->m_viLink[i] == -1) { continue; } + dist++; + if(Node2 == m_Node[Node1]->m_viLink[i]) { Fix=true; return dist; } + NodeDist(m_Node[Node1]->m_viLink[i],Node2,Node1); + if(Fix == true) { return dist; } + } + return --dist; +} + +int CTree::BranchDist(int B1, int B2, bool AllowZero) { + int i,j; + int mindist = BIG_NUMBER, dist; + if((B1 < 0 || B1 > NoBra()) || (B2 < 0 || B2 > NoBra())) { return 1; }; + FOR(i,2) { + if(BraLink(B1,i) == -1) { continue; } + FOR(j,2) { + if(BraLink(B2,j) == -1) { continue; } + dist = NodeDist(BraLink(B1,i),BraLink(B2,j)); + if(dist < mindist) { mindist = dist; } + } } + if(mindist == BIG_NUMBER) { return -1; } + return mindist; +} + +void CTree::AssignNodeType(int N, ENodeType Type) { + assert(InRange(N,0,NoNode())); + switch(Type) { + case branch: + assert(NoLinks(N) == 3); + m_Node[N]->m_NodeType = branch; + break; + case leaf: + assert(NoLinks(N) == 1); + m_Node[N]->m_NodeType = leaf; + break; + case root: + assert(NoLinks(N) == 2); + m_Node[N]->m_NodeType = root; + break; + default:; + Error("Trying to assign a node to unknown type of ENodeType\n"); + } +} + +/////////////////// Tree based pairwise distance functions ////////////////////////// +// Get only the distances between the leaf nodes +vector CTree::GetTreePW() { + int i,seq; + vector dist(m_iNoSeq*m_iNoSeq,0); + vector pdist(m_iNoSeq,0); + // Get the distances using recursive function + FOR(seq,m_iNoSeq) { + PWDistSub(seq,-1,&pdist); + FOR(i,m_iNoSeq) { dist[(seq*m_iNoSeq) + i] = pdist[i]; } + } + return dist; +} + +// Get distances between leaf and internal node distances +vector CTree::GetAllTreePW() { + int i,j; + bool LabelledNodes = false; + vector dists, retdist; + dists.resize(m_iNoNode); retdist.resize(m_iNoNode*m_iNoNode); + + FOR(i,m_iNoNode) { + // Get simple pairwise distances + // Get node to tip distances + if(i < m_iNoSeq) { j = i; } else { + FOR(j,m_iNoNode) { if(m_Node[j]->m_iInternalNodeNum == i+1) { LabelledNodes = true; break; } } + if(j == m_iNoNode) { j = i; if(LabelledNodes == true) { cout << "\nWarning: Tree::GetAllTreePW() some nodes labelled whilst some are not..."; } } + } + PWDistSub(j,-1,&dists,true); + FOR(j,m_iNoNode) { retdist[(i*m_iNoNode)+j] = dists[j]; } + } +// cout << "\nThe node calculations are:"; FOR(i,m_iNoNode) { cout << "\nNode["< *d, bool DoInternalNodes) { + int i; + static double dist; + // Initialise if first node + if(NodeFrom == -1) { + dist = 0.0; + assert((DoInternalNodes == false && d->size() == m_iNoSeq) || (DoInternalNodes == true && d->size() == m_iNoNode)); + FOR(i,(int)d->size()) { d->at(i) = -1; } + } + // If a terminal node then store the distance + if(NodeTo < m_iNoSeq) { d->at(NodeTo) = dist; } else if(DoInternalNodes == true) { d->at(NodeTo) = dist; } + // recurse down tree + FOR(i,(int)m_Node[NodeTo]->m_viLink.size()) { + if(m_Node[NodeTo]->m_viLink[i] == NodeFrom || m_Node[NodeTo]->m_viLink[i] == -1) { continue; } + dist += B(m_Node[NodeTo]->m_viBranch[i]); + PWDistSub(m_Node[NodeTo]->m_viLink[i],NodeTo,d,DoInternalNodes); + dist -= B(m_Node[NodeTo]->m_viBranch[i]); + } +} + +// Recursive function that collects the nodes of exactly NodeDepth, will also include leaf nodes if less than NodeDepth if GetLess == true +vector CTree::GetNodesOfDepth(int InitNode, int NodeDepth, bool GetLess, vector *NodesFrom, vector *NodeCov, vector *ExtBra,int NodeFr, bool First) { + static vector vNodes; + static int CurDepth; + int i; + // Check entry conditions + assert(InRange(InitNode,0,m_iNoNode) && InRange(NodeFr,-1,m_iNoNode) && (NodeDepth > 0)); + // Initialise if required + if(First == true) { vNodes.clear(); CurDepth = 0; if(NodesFrom != NULL) { NodesFrom->clear(); } if( NodeCov != NULL) { NodeCov->clear(); } } + // Get the nodes if the node depth is reached + if(CurDepth == NodeDepth) { + vNodes.push_back(InitNode); + if(NodesFrom != NULL) { NodesFrom->push_back(NodeFr); } + // Find Branch + if(ExtBra != NULL) { + FOR(i,m_iNoBra) { + if( (m_ariBraLinks[i][0] == InitNode && m_ariBraLinks[i][1] == NodeFr) || (m_ariBraLinks[i][0] == NodeFr && m_ariBraLinks[i][1] == InitNode) ) { + ExtBra->push_back(m_vdBra[i]); break; + } } + if(i == m_iNoBra) { Error("\nCannot find branch in GetNodesOfDepth...\n"); } + } + return vNodes; + } + // Get the covered nodes + if(NodeCov != NULL) { + if(!IsIn(InitNode,*NodeCov) && InitNode >= m_iNoSeq) { NodeCov->push_back(InitNode); } + } + // Check for leaf nodes + if(InitNode < m_iNoSeq) { + if(GetLess == true) { + vNodes.push_back(InitNode); + if(NodesFrom != NULL) { NodesFrom->push_back(NodeFr); } + // Find Branch + if(ExtBra != NULL) { + FOR(i,m_iNoBra) { + if( (m_ariBraLinks[i][0] == InitNode && m_ariBraLinks[i][1] == NodeFr) || (m_ariBraLinks[i][0] == NodeFr && m_ariBraLinks[i][1] == InitNode) ) { + ExtBra->push_back(m_vdBra[i]); break; + } } + if(i == m_iNoBra) { Error("\nCannot find branch in GetNodesOfDepth...\n"); } + } } + return vNodes; + } + // Otherwise descend the tree by looping through the nodes + CurDepth++; + FOR(i,(int)m_Node[InitNode]->m_viLink.size()) { + if(m_Node[InitNode]->m_viLink[i] == NodeFr || m_Node[InitNode]->m_viLink[i] == -1) { continue; } + GetNodesOfDepth(m_Node[InitNode]->m_viLink[i],NodeDepth,GetLess,NodesFrom,NodeCov,ExtBra,InitNode,false); + } + CurDepth--; + return vNodes; +} + +////////////////////////////////////////////////////////////////////////////////////////// +// Centre-Point algorithms for SNAP + +// Branch centre point +vector CTree::BranchCP(int CP, int Depth, vector *NodeRem, vector *NodeCovered, vector *ExtBra) { + int i,j; + vector Nodes, temp, RetNode, NC; + NodeRem->clear(); if(NodeCovered != NULL) { NodeCovered->clear(); } + // Clean in preparation of new subtree + FOR(i,2) { + j = 0; if(i==0) { j = 1; } + Nodes = GetNodesOfDepth(m_ariBraLinks[CP][i],Depth,true,&temp,&NC,ExtBra,m_ariBraLinks[CP][j]); + assert(Nodes.size() == temp.size()); + // Store the nodes for the leafmap and the nodes from for calculating partial likelihoods + FOR(j,(int)Nodes.size()) { RetNode.push_back(Nodes[j]); NodeRem->push_back(temp[j]); } + if(NodeCovered != NULL) { FOR(j,(int)NC.size()) { if(!IsIn(NC[i],*NodeCovered)) { NodeCovered->push_back(NC[j]); } } } + } + return RetNode; +} +// Node centre point +vector CTree::NodeCP(int Node, int Depth, vector *NodeRem, vector *NodeCovered, vector *ExtBra) { + vector Nodes, NodeFr, NC; + // Clean the node lists + NodeRem->clear(); if(NodeCovered != NULL) { NodeCovered->clear(); } + // Clean in preparation of new subtree + Nodes = GetNodesOfDepth(Node,Depth,true,NodeRem,NodeCovered,ExtBra); + assert(Nodes.size() == NodeRem->size()); + return Nodes; +} + +// Tree replacement function +void CTree::ReplaceTreeCP(CTree *NT,vector LeafMap,vector NCover, bool VerifyBranchLinks) { + int i,j,nf, nt; + vector New2Old, Bran, NewL, NewB, NodeIntVals; + assert(LeafMap.size() + NCover.size() == NT->m_iNoNode); + New2Old = LeafMap; +#if TREE_DEBUG + int count + // Check entry conditions + FOR(i,LeafMap.size()) { // Check all leaves map to one and only one NCover + count = 0; + FOR(j,NoLinks(LeafMap[i])) { if(IsIn(NodeLink(LeafMap[i],j),NCover)) { count ++; } } + assert(count == 1); + if(count != 1) { cout << "\nCTree::ReplaceTreeCP -- Error in LeafList " << LeafMap[i] << "; count = " << count << "..."; OutDetail(); exit(-1); } + } + FOR(i,NCover.size()) { // Check all nodes covered link only to other nodes covered and leafmaps + count = 0; + FOR(j,NoLinks(NCover[i])) { if(IsIn(NodeLink(NCover[i],j),NCover) || IsIn(NodeLink(NCover[i],j),LeafMap)) { count ++; } } + assert(count == NoLinks(NCover[i])); + if(count != NoLinks(NCover[i])) { cout << "\nCTree::ReplaceTreeCP -- Error in NCover " << NCover[i] << "..."; OutDetail(); exit(-1); } + } +#endif + FOR(i,(int)NCover.size()) { + // Get branches + FOR(j,(int)m_Node[NCover[i]]->m_viBranch.size()) { + if(!IsIn(m_Node[NCover[i]]->m_viBranch[j],Bran)) { Bran.push_back(m_Node[NCover[i]]->m_viBranch[j]); } + } + // Get Node nums and delete them + New2Old.push_back(NCover[i]); NodeIntVals.push_back(m_Node[NCover[i]]->m_iInternalNodeNum); + delete m_Node[NCover[i]]; m_Node[NCover[i]] = NULL; + } + sort(Bran.begin(),Bran.end()); + assert(Bran.size() == NT->m_iNoBra); + // Apply the nodes + FOR(nf,NT->m_iNoNode) { + nt = New2Old[nf]; // Set NodeTo; + // Do leaf nodes + if(nfm_iNoSeq) { + // find the link that goes towards another covered node + FOR(i,(int)m_Node[nt]->m_viLink.size()) { if(IsIn(m_Node[nt]->m_viLink[i],New2Old)) { break; } } + assert(i != m_Node[nt]->m_viLink.size() && NT->m_Node[nf]->m_viLink.size() == 1); + // Set the link + m_Node[nt]->m_viLink[i] = New2Old[NT->m_Node[nf]->m_viLink[0]]; + // Set the branch + m_Node[nt]->m_viBranch[i] = Bran[NT->m_Node[nf]->m_viBranch[0]]; + } else { // Do internal nodes + NewL.clear(); NewB.clear(); + // Sort out the links + FOR(i,(int)NT->m_Node[nf]->m_viLink.size()) { + NewL.push_back(New2Old[NT->m_Node[nf]->m_viLink[i]]); + NewB.push_back(Bran[NT->m_Node[nf]->m_viBranch[i]]); + } + // Create the new node + assert(InRange( (int) (nf - NT->m_iNoSeq) ,(int) 0,(int) NodeIntVals.size())); + if(m_Node[nt] != NULL) { delete m_Node[nt]; } + m_Node[nt] = new CNode(NewL,NewB,NodeIntVals[nf - NT->m_iNoSeq]); + // Apply the branch lengths + FOR(i,(int)m_Node[nt]->m_viBranch.size()) { + SetB(m_Node[nt]->m_viBranch[i],NT->B(NT->m_Node[nf]->m_viBranch[i]),true,true); + } } } + BuildBraLinks(VerifyBranchLinks); + OrderNode(); + m_bFastCalcOK = false; + ValidateTree(); +} + + +/////////////////////////////////////////////////////////////////////////////// +// Gets the base pairwise distances for a tree without the subtree (defined by LeafMap & NCover) +// branches counted. +vector CTree::GetPartialTreeDist(vector LeafMap, vector > NBelow) { + int Leaf,Leaf2,i,j; + vector TreeDist(m_iNoSeq*m_iNoSeq,0); + vector dist(m_iNoSeq,0); + // Check entry conditons + assert(NBelow.size() == LeafMap.size()); + // Do the calculations + FOR(Leaf,(int)LeafMap.size()) { // Loop through the leaves and get the pairwise distances + if(NBelow[Leaf].size() == 1) { continue; } + PWDistSub(LeafMap[Leaf],-1,&dist); // Distances between LeafMap nodes and the true leaves + assert(dist.size() == m_iNoSeq); + // Loop through the Nodes Below and fill in the partial distances + FOR(i,(int)NBelow[Leaf].size()) { // This node + FOR(Leaf2,(int)LeafMap.size()) { + if( (NBelow[Leaf].size() == 1 && NBelow[Leaf2].size() == 1) || Leaf==Leaf2) { continue; } + FOR(j,(int)NBelow[Leaf2].size()) { + TreeDist[(m_iNoSeq*NBelow[Leaf][i]) + NBelow[Leaf2][j]] += dist[NBelow[Leaf][i]]; + TreeDist[(m_iNoSeq*NBelow[Leaf2][j]) + NBelow[Leaf][i]] += dist[NBelow[Leaf][i]]; + } } } + // Now do the other distances that don't span the subtree + FOR(i,(int)NBelow[Leaf].size()) { + PWDistSub(NBelow[Leaf][i],-1,&dist); + FOR(j,i) { + TreeDist[(m_iNoSeq*NBelow[Leaf][i])+NBelow[Leaf][j]] += dist[NBelow[Leaf][j]]; + TreeDist[(m_iNoSeq*NBelow[Leaf][j])+NBelow[Leaf][i]] += dist[NBelow[Leaf][j]]; + } } } + return TreeDist; +} + +////////////////////////////////////////////////////////////// +// Get full pairwise distances from a subtree and a set of +// partial RMSDs taken from GetPartialTreeDist(...) above +vector CTree::GetSubTreePW(vector LeafMap, vector > NBelow, vector Dist) { + int Leaf,Leaf2,i,j,OriNoSeq = 0; + vector SubDist = GetTreePW(); // The pairwise distances for the subtree + // Count the number of sequences + FOR(i,(int)NBelow.size()) { OriNoSeq += (int)NBelow[i].size(); } + // Check entry conditions + assert(LeafMap.size() == NBelow.size()); assert(Dist.size() == OriNoSeq * OriNoSeq); + // Do the distances + FOR(Leaf,(int)LeafMap.size()) { + FOR(i,(int)NBelow[Leaf].size()) { + FOR(Leaf2,Leaf) { + FOR(j,(int)NBelow[Leaf2].size()) { + Dist[(NBelow[Leaf][i] * OriNoSeq)+NBelow[Leaf2][j]] += SubDist[(Leaf*m_iNoSeq)+Leaf2]; + Dist[(NBelow[Leaf2][j] * OriNoSeq)+NBelow[Leaf][i]] += SubDist[(Leaf*m_iNoSeq)+Leaf2]; + } } } } + return Dist; +} + +int CTree::BestStartCalc() { + int i, BestBra = -1; + double temp = -1.0, BScore = -1.0; + // Find the most efficient place to perform the calculation + FOR(i,NoBra()) { temp = QuadB(i); if(temp > BScore) { BScore = temp; BestBra = i; } } + assert(InRange(BestBra,0,NoBra())); + m_iStartCalc = BestBra; + m_bFastCalcOK = false; + return m_iStartCalc; +} + +/////////////////////////////////////////////////////////////////////// +// Routines to build subtrees + +void CTree::BuildOriSubTree(CTree *T, vector NodesBool) { + int i; + vector NC; // Values for passing to another BuildOriSubTree routine + assert(NodesBool.size() == NoNode() + NoSeq()); + FOR(i,(int)NodesBool.size()) { if(NodesBool[i] == true) { NC.push_back(i); } } + BuildOriSubTree(T,NC); +} + +void CTree::BuildOriSubTree(CTree *T, vector NC) { + int i,j, NodCount, NPoint; + vector NCover,LeafMap,NFrom; // The values that NC shall be changed to + // Break up NC into the correct numbers + FOR(i,(int)NC.size()) { + if(NC[i] < NoSeq()) { LeafMap.push_back(NC[i]); NFrom.push_back(NodeLink(NC[i],0)); continue; } // Do leaf nodes + // Find out whether the node is a leaf node or an internal node + NodCount = 0; NPoint = -1; + FOR(j,NoLinks(NC[i])) { + if(IsIn(NodeLink(NC[i],j),NC)) { + NodCount++; NPoint = NodeLink(NC[i],j); + if(NodCount > 1) { break; } + } } + // NodCount == 1 means that it is a leaf node + if(NodCount == 1) { LeafMap.push_back(NC[i]); NFrom.push_back(NPoint); } + // NodCount > 1 means its a covered node + else if(NodCount > 1) { NCover.push_back(NC[i]); } + // NodCount == 0 is an error + else { Error("Error when constructing subtree in CTree::BuildOriSubTree(...)\n"); } + } + assert(LeafMap.size() == NFrom.size()); + BuildOriSubTree(T,LeafMap,NCover,NFrom); +} + +void CTree::BuildOriSubTree(CTree *RetTree, vector LeafMap, vector NCover, vector NFrom) { + int i,j,k,Node,NoSeq = (int)LeafMap.size(); + vector Link, Branch; Link.assign(3,-1); Branch.assign(3,-1); + vector BVal; BVal.assign(3,-1.0); + bool flag; + assert(!LeafMap.empty() && LeafMap.size() == NCover.size() + 2); + // Do only Nodes + RetTree->GetMemory(NoSeq); + FOR(i,NoSeq) { // LeafNodes (easy) + // Find the parent node + FOR(Node,(int)NCover.size()) { if(NCover[Node] == NFrom[i]) { break; } } + assert(Node != NCover.size()); + // Create the child node + RetTree->m_Node[i]->SetNode(Node + NoSeq,i,-1); + } + FOR(i,(int)NCover.size()) { // Branch nodes + FOR(j,3) { // Get each of the three links + // Find the real tree node + Node = m_Node[NCover[i]]->m_viLink[j]; + // Map back to the subtree nodes + flag = false; FOR(k,NoSeq) { if(LeafMap[k] == Node) { flag = true; break; } } + if(k == NoSeq) { + FOR(k,(int)NCover.size()) { if(NCover[k] == Node) { k += NoSeq; flag = true; break; } } + } + assert(flag == true); + Link[j] = k; + } + sort(Link.begin(),Link.end()); + RetTree->m_Node[i+NoSeq]->SetNode(Link[0],Link[1],Link[2],-1,-1,-1,-1); + } + // Now build the branches and their links + RetTree->BuildBranches(); + RetTree->BuildBraLinks(); + // Now copy branches lengths from original tree to RetTree + Link.clear(); Link = LeafMap; FOR(i,(int)NCover.size()) { Link.push_back(NCover[i]); } + FOR(i,(int)RetTree->m_vdBra.size()) { + RetTree->SetB(i,B(FindBra(Link[RetTree->m_ariBraLinks[i][0]],Link[RetTree->m_ariBraLinks[i][1]])),true); + } + RetTree->m_bReady = true; +} + +////////////////////////////////////////////////////////////////////////// +// Function used for investigating knots in the tree +vector > CTree::GetKnotClusters(vector INC, int ChangeRad) { + int i,j,k; + vector > Clusters; + vector v_C,v_Blank; + assert(INC.size() == NoNode()); + + // Get original clusters + for(i=NoSeq();i NewNames,bool Overwrite) { + if(!Names().empty() && Overwrite == false) { Error("\nTrying to overwrite names when not permitted in CTree::SetNames(...)\n"); } + if(NewNames.size() != m_iNoSeq) { Error("\nTrying to write " + int_to_string(NewNames.size()) + " NewNames to a tree with " + int_to_string(m_iNoSeq) + " sequences in CTree::SetNames\n"); } + m_vsName = NewNames; +} + + + +bool IsSameTree(CTree *T1, CTree *T2) { + assert(!T1->IsCutTree() && !T2->IsCutTree()); + int i,j; + vector < vector > L1; + vector L2,R2; + // Do the obvious checks first! + if(T1->NoSeq() != T2->NoSeq()) { return false; } + // Get branch sets for T1 + FOR(i,T1->NoBra()) { + L2.clear(); R2.clear(); + T1->BranchSets(i,&L2,&R2); + L1.push_back(L2); + } + // Now compare these branch sets to those of T2 + FOR(i,T2->NoBra()) { + L2.clear(); R2.clear(); T2->BranchSets(i,&L2,&R2); // Get branch sets for Tree 2 + // See if they match T1 + FOR(j,T1->NoBra()) { if(Compare(&L1[j],&L2)) { break; } } + if(j == T1->NoBra()) { return false; } + } + // If these conditions are all met, then they're the same tree + return true; +} + +///////////////////////////////////////////////////////////////////////////////// +// Functions for finding maximum tree length subtree containing exactly SubSeq +// from the tree FullTree +CTree FindGreedySubTree(CTree *FullTree, int SubSeq) { + int i,j,max,bra, best_seq, best_bra; + double dist, best_val; + vector PWDists; + vector SeqsToAdd; FOR(i,FullTree->NoSeq()) { SeqsToAdd.push_back(i); } + CTree CurTree; + string Start; + // Get starting pair of sequences + // 0. Need to ensure there are no real multifurcations because these can end up being resolved incorrectly. + FOR(i,FullTree->NoBra()) { + if(FullTree->B(i) < FLT_EPSILON) { FullTree->SetB(i,Random()*1.0E-4); } + } + // 1. Get PW distances + PWDists = FullTree->GetTreePW(); + // 2. Find the minimum pairwise distance and initialise the tree + dist = 0; max = 0; FOR(i,(int)PWDists.size()) { if(PWDists[i] > dist) { dist = PWDists[i]; max = i; } } + Start = "(" + int_to_string((max / FullTree->NoSeq())+1) + ":" + double_to_string(PWDists[max]) + "," + int_to_string((max % FullTree->NoSeq())+1) + ":0.0);"; + SeqsToAdd[max / FullTree->NoSeq()] = -1; SeqsToAdd[max % FullTree->NoSeq()] = -1; + CurTree.CreateTree(Start,FullTree->Names(),true,true,true); + // Progressively add the sequences to the tree +// FullTree->OutBra(); cout << "\nInitial starting tree:\n" << *FullTree; + PWDists = FullTree->GetAllTreePW(); + FOR(i,SubSeq-2) { +// cout << "\nDoing sequence add #" << i << "\n\tTrying"; + // Find which sequence gives greatest gain + best_seq = -1; best_val = -BIG_NUMBER; + FOR(j,(int)SeqsToAdd.size()) { +// cout << " " << SeqsToAdd[j] << flush; + if(SeqsToAdd[j]<0) { continue; } + // Initialise the tree part + dist = TravAddGreedy(&CurTree,(int)CurTree.StartCalc(),-1,SeqsToAdd[j],&PWDists,&bra); +// cout << "^" << flush; + if(dist > best_val) { best_seq = j; best_val = dist; best_bra = bra; } + } +// cout << "\n\tBest " << SeqsToAdd[best_seq] << flush; + GreedySeq2Tree(best_bra,SeqsToAdd[best_seq],&CurTree,&PWDists); SeqsToAdd[best_seq] = -1; +// CurTree.OutBra(); cout << "\n\t" << CurTree; + // Check compatibility between new and full tree + assert(FullTree->IsCompatible(CurTree)); + } + return CurTree; +} + +// Core in order tree traversal for adding a sequence +// Need to find the closest location to add a sequence +double TravAddGreedy(CTree *CurT, int To, int Fr, int Seq2Add, vector *PWDist, int *BestBra) { + int i; + static double best; // Current best value + double dist; + if(CurT->NodeType(To) == leaf) { // Leaf nodes + if(Fr == -1) { // Starting node + *BestBra = CurT->FindBra(To,CurT->NodeLink(To,0)); + best = GetDist(Seq2Add,To,CurT->NodeLink(To,0),PWDist); // Always store the first as best + } else { + dist = GetDist(Seq2Add,To,Fr,PWDist); + if(dist < best) { *BestBra = CurT->FindBra(To,Fr); best = dist; } + } + } else { // internal nodes + FOR(i,CurT->NoLinks(To)) { if(CurT->NodeLink(To,i) == Fr || CurT->NodeLink(To,i) == -1) { break; } } + assert(i != CurT->NoLinks(To)); + // If the node from isn't a leaf node do the internal calculation (i.e. avoids first node) + if(CurT->NodeType(CurT->NodeLink(To,i)) != leaf) { + dist = GetDist(Seq2Add,To,Fr,PWDist); + if(dist < best) { *BestBra = CurT->FindBra(To,Fr); best = dist; } + } } + // Do the looping + FOR(i,CurT->NoLinks(To)) { + if(CurT->NodeLink(To,i) == Fr || CurT->NodeLink(To,i) == -1) { continue; } + TravAddGreedy(CurT,CurT->NodeLink(To,i),To,Seq2Add, PWDist,BestBra); + } + return best; +} +// Get the distance obtained from adding a sequence (Add) between nodes (a,b) +double GetDist(int Add, int a, int b, vector *PWDist) { + int index = sqrt(PWDist->size()); + double d_ab = PWDist->at((a*index)+b), d_aAdd = PWDist->at((a*index)+Add),d_bAdd = PWDist->at((Add*index)+b), l = d_ab+d_aAdd+d_bAdd; + return 0.5 * (l - (2*d_ab)); + +} + +void GreedySeq2Tree(int Br, int Seq2Add, CTree *CurTree, vector *PWDist) { + int i,j,k; + int tmp, NLeft = CurTree->BraLink(Br,0), NRight = CurTree->BraLink(Br,1); + vector vtmp; + if(NRight > NLeft) { tmp = NRight; NRight = NLeft; NLeft = tmp; } + double Dleft = GetDist(NLeft,NRight,Seq2Add,PWDist), Dright = GetDist(NRight,NLeft,Seq2Add,PWDist), Dnew = GetDist(Seq2Add,NLeft,NRight,PWDist); + double prop; + // Error check and get the branch proportions +// cout << "\nDleft (" << Dleft << ") + Dright ("<< Dright << ") == " << Dleft + Dright << " cf. " << CurTree->B(Br) << " diff: " << fabs(CurTree->B(Br) - (Dleft + Dright)); + assert(fabs((Dleft + Dright) - CurTree->B(Br)) < 10-4); // Check branch lengths agree + prop = Dright / (Dleft + Dright); + // Create the new node and branch for adding to the tree + // 1. Find a spare node and branch + assert(CurTree->NoLinks(Seq2Add) == 0); // Check the node is empty + // Find the node that matches the expected distances + // Note this this not guarantee the correct mapping between the CurTree and original tree due to multifurcations, but it works! + for(i=CurTree->NoSeq();iNoNode();i++) { // Find spare internal node + if(CurTree->NoLinks(i) == 0) { + // Check 3 way distance condition matches + if(fabs(PWDist->at((CurTree->NoNode() * i) + NLeft) - Dleft) < FLT_EPSILON && + fabs(PWDist->at((CurTree->NoNode() * i) + NRight) - Dright) < FLT_EPSILON && + fabs(PWDist->at((CurTree->NoNode() * i) + Seq2Add) - Dnew) < FLT_EPSILON) { break; } + } } + assert(i != (int)CurTree->NoNode()); + // Find two spare branches + k = -1; FOR(j,CurTree->NoBra()) { if(!CurTree->GoodBra(j)) { if(k==-1) { k = j; } else { /*cout << "\nBranch[" <ReplaceNodeLink(Seq2Add,vtmp); // } replace nodes + vtmp.clear(); vtmp.push_back(Seq2Add); vtmp.push_back(-1); vtmp.push_back(-1); CurTree->ReplaceNodeLink(i,vtmp); // } + CurTree->AssignNodeType(Seq2Add,leaf); CurTree->AssignNodeType(i,branch); // } + vtmp.clear(); vtmp.push_back(j); CurTree->ReplaceNodeBra(Seq2Add,vtmp); // } + vtmp.push_back(k); vtmp.push_back(-1); CurTree->ReplaceNodeBra(i,vtmp); // } Do branches + CurTree->ReplaceBraLink(j,0,min(Seq2Add,i)); CurTree->ReplaceBraLink(j,1,max(Seq2Add,i)); // } Do links + CurTree->ReplaceBraLink(k,0,i); CurTree->SetB(j,Dnew); // } + + CurTree->AddSeq(Seq2Add,Br,prop); +} + +vector ReadTreeNames(string Tree) { + vector retNames; + for(int i = 0; i < Tree.size(); i++) { + if(Tree[i] == '(' || Tree[i] == ',') { + if(Tree[i+1] == '(') { continue; } + i++; + for(int j = i; j < Tree.size(); j++) { + if(Tree[j] == ',' || Tree[j] == ')' || Tree[j] == ':') { retNames.push_back(Tree.substr(i,j-i)); break; } + } + } + } + return retNames; +} + +////////////////// Tools ///////////// +bool FlipBool(bool V) { if(V == true) { return false; } return true; } +bool FlipBin(int i) { assert(i==0||i==1); if(i==0) { return 1; } return 0; } +string int_to_string(int num) { stringstream ss; ss << num; return ss.str(); } +string double_to_string(double num) { stringstream ss; ss << num; return ss.str(); } +ostream& Error(string str , ostream &os) { os << str; assert(0); exit(-1); }; + +vector Tokenise(string line) { + string buf; + stringstream in(line); + vector Toks; + Toks.~vector(); + while(in >> buf) { Toks.push_back(buf); } + return Toks; +} +vector Tokenise(string line, string Delim) { + size_t i = 0, j,j1; + vector Toks; + while(i != (int)line.size()) { + j = line.find(Delim,i+1); + if(j == string::npos) { j = j1 = (int)line.size(); } else { j1 = j+1; } + Toks.push_back(line.substr(i,j-i)); + i = j1; + } + return Toks; +} + +string EatWhiteSpace(string line) { + string::iterator it, temp; + IFOR(it,line) { if(isspace(*it)) { temp = it; it = it - 1; line.erase(temp); } } + return line; +} diff --git a/Tree.h b/Tree.h new file mode 100644 index 0000000..1c3cb30 --- /dev/null +++ b/Tree.h @@ -0,0 +1,336 @@ + +#ifndef __TREE_HEADER +#define __TREE_HEADER + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "Random.h" + +using namespace::std; + +// Some stuff originally in tools.h +#define BIG_NUMBER 100000000 // A large number +vector Tokenise(string line); // Tokenise a string +vector Tokenise(string line, string Delim); // Tokenise a string according to delimiter Delim +string EatWhiteSpace(string line); +#define my_min(a,b) ((a)<(b)?(a):(b)) +#define my_max(a,b) ((a)>(b)?(a):(b)) +#define GET_MEM(var,type,size) var = new type[size]; assert(var != NULL); +#define DEL_MEM(var) if(var != NULL) { delete [] var; } var = NULL; +#define IFOR(iter, list) for(iter = list.begin(); iter!= list.end(); iter++) +#define FOR(i,n) for(i=0; i= 0) { printf(" %d",suffix); } + fflush(stdout); + if(count == 4) { count = 0; } +}; + +template ostream& operator<<(ostream & os, vector Vec) { int i; FOR(i,(int)Vec.size()) { os << Vec[i] << " "; } return os; } +template bool InRange(TRange Val, TRange LowerBound, TRange UpperBound) { return ( !(Val < LowerBound) && ( Val < UpperBound) ); } +template bool IsIn(TIsIn Val, vector Vec) { + if(Vec.empty()) { return false; } + if(find(Vec.begin(),Vec.end(),Val) == Vec.end()) { return false; } + return true; +} +template TSumType Sum(vector *Vec) { + TSumType Sum = 0; + int i; + FOR(i,(int)Vec->size()) { Sum += Vec->at(i); } + return Sum; +} +template bool Compare(vector *List1, vector *List2) { + int i; + bool same = true; + if(List1->size() != List2->size()) { return false; } + FOR(i,(int)List1->size()) { if(fabs((double) (List1->at(i) - List2->at(i))) > 1.0E-6) { same = false; break; } } + return same; +} +template +std::vector ordered(std::vector const& values) { + std::vector indices(values.size()); + std::iota(begin(indices), end(indices), static_cast(0)); + std::sort(begin(indices), end(indices), [&](int a, int b) { return values[a] < values[b]; }); + return indices; +} + + + + + +#define TREE_START 1 // Number that input trees first species start with +#define SET_BRANCH 0.1 // Default branch length + +enum ENodeType { branch, leaf, root}; + +struct SSplit{ int BrLabel; vector Left, Right; }; + +class CNode { +public: + // Member variables + //////////////////////// + int m_iNoLinks; + int m_iInternalNodeNum; + vector m_viLink; + vector m_viBranch; + ENodeType m_NodeType; + // Member functions + ////////////////////// + // Note parent node ALWAYS should be specified in the last branchx and linkx where applicable + // General constructor + CNode(int NoLinks = -1,int *LinkList = NULL); + // Constructor for internal nodes + CNode(int linka, int linkb, int linkc, int brancha, int branchb, int branchc, int IntVal = -1); + // Constructor for external nodes + CNode(int linka, int brancha, int IntVal = -1); + // Other constructor + CNode(vector Links, vector Branches, int IntVal); + // Copy constructor + CNode(const CNode &Node); + // Destructor function + virtual ~CNode(); + // Member functions specific for bifurcating trees where linkc is defined as a parent + void SetNode(int linka, int linkb, int linkc, int brancha, int branchb, int branchc, int IntVal); + void SetNode(int la, int lb, int ba, int bb, int IntVal); + void SetNode(int linka, int brancha, int IntVal); + // General functions for assigning links (No parent assumed) + void SetNode(int NoLinks, int *LinkList); + // Cleaning operation + void CleanNode(); + // Operator functions + CNode &operator=(const CNode &); +}; + +ostream& operator<<(ostream& os, const CNode &Node); + +class CTree +{ +public: + // Member functions + ///////////////////////////////// + + // Constructor functions + CTree(string TREE, vector Names, bool AllowFail = false, bool AllowSubTree = false); // Basic constructor + CTree(); + void CreateTree(string TREE,vector Names,bool CheckVar = true, bool AllowFail = false,bool AllowSubTree = false);// Underlying construction function + // Copy Constructor + CTree(const CTree &Tree); + // Destructor function + virtual ~CTree(); + // Memory functions + void CleanTree(); + void GetMemory(int NoSeq); + + // Access functions + //////////////////////////////////////// + // Main tree + inline bool IsRooted() { return m_bRooted; } // Returns whether the tree is rooted + inline int Root() { return m_iRootNode; } // Returns the root node index + inline int NoNode() { return m_iNoNode; } // Number of nodes in tree + inline int NoBra() { return m_iNoBra; } // Returns number of branches in tree + inline int NoOptBra() { return m_iNoOptBra; } // Returns the number of optimised branches in tree + inline int NoSeq() { return m_iNoSeq; } // Returns number of sequences in tree + inline int StartCalc() { return m_iStartCalc; } // Returns where the calculation starts + bool OldTree() { return m_bOldTree; } // Returns whether the tree is old or not + bool SetOldTree(bool V) { m_bOldTree = V; return V; } // Sets the m_bOldTree value + int BestStartCalc(); // Sets the m_iStartCalc to the best value + int SetStartCalc(int S) { m_iStartCalc=S;return S;} // Forces m_iStartCalc to a value -- power user function because can cause problems + bool FastCalcOK() { return m_bFastCalcOK; } // Returns whether the fast calc is okay or needs resetting + void SetFastCalcOK(bool V) { m_bFastCalcOK = V; } // Sets the m_bFastCalcOK flag + int SetOutBra(int V) { m_iOutBra = V; return V; }// Sets m_iOutBra + void OutBra() { m_iOutBra = 1; } // Sets m_iOutBra to output branches + void NoOutBra() { m_iOutBra = 0; } // Sets m_iOutBra to not output branches + void OutBraNum() { m_iOutBra = 2; } // Sets m_iOutBra to output branch numbers + void OutName() { m_bOutName = true; } // Set for outputting names + void NoOutName() { m_bOutName = false; } // Set for not outputting names + void OutLabel() { m_bOutLabel = true; } // Set for outputting labels on tree + void NoOutLabel() { m_bOutLabel = false; } // Set for not outputting labels on tree + void CreateBranchLabels(bool Force = true); // Create labels on branches that correspond with their numbers + bool Valid() { return m_bValid; } // Returns whether a valid constructor has been run + void ForceReady() { m_bReady = true; } // Makes the tree ready. + vector Names() { return m_vsName; } // Returns the vector of names + void SetNames(vector NewNames, bool Overwrite = false); // Set the names in the tree (e.g. for output) + bool BranchLabels() { if(m_viBranchLabels.empty()) { return false; } return true; } + vector Labels() { return m_viBranchLabels; } + int NoLabels() { return m_iNoBraLabels; } + // Branch + vector Branches(); // Returns a vector of the branch lengths + double B(int Branch); // Returns the value of branch length + double *OptimiserB(int Branch); // Returns the pointer to the value to be used in optimisation + double TreeLength(); // Returns the tree length; + double SetB(int Branch, double Value, bool Update = false, bool Rescale = false); // Set branch length Branch to Value + double MulB(int Branch, double Value, bool Update = false, bool Rescale = false); // Multiply branch length Branch by Value + double AddB(int Branch, double Value, bool Update = false, bool Rescale = false); // Add Value to branch length Branch + double QuadB(int Branch); // Get the average value of Branch and the (upto) 4 surrounding branches + inline int BraLink(int B, int L) { return m_ariBraLinks[B][L]; } // Get the L link for branch B + inline void ReplaceBraLink(int B,int L,int Val) { m_ariBraLinks[B][L]=Val;}; // Set the L link for branch B to Val; returns Val; + + // Node + inline int NoLinks(int N) { return (int) m_Node[N]->m_viLink.size(); } // Returns the # links in a node + int NodeLink(int N,int L) { return m_Node[N]->m_viLink[L]; } // Returns link L of node N + bool IsNodeLink(int N, int Val) { return IsIn(Val,m_Node[N]->m_viLink); } // Returns whether a Val is in Node N m_viLink + void ReplaceNodeLink(int N, vectorL); // Replace m_viLink of node N with L; + void ReplaceNodeLinkElement(int N, int Element, int Val); // Replace a single element of m_viLink in Node N with Val + int NodeBra(int N, int B) { return m_Node[N]->m_viBranch[B]; } // Returns Branchlink B of Node N + void ReplaceNodeBra(int N, vector B); // Replace m_viBranch of Node N with B + void ReplaceNodeBraElement(int N, int Element, int Val); // Replace a single element of m_viBranch in Node N with Val + ENodeType NodeType(int N) { return m_Node[N]->m_NodeType; } // Returns the NodeType of Node N + bool NodeNull(int N) { if(m_Node[N] == NULL) { return true; } return false; } // Returns whether a node is NULL or not + int NoLeafLink(int N); // Returns the number of leaf links for Node N + void AssignNodeType(int N, ENodeType Type); // Set the NodeType + + // Output functions + friend ostream& operator<<(ostream& os, CTree &Tree); // Standard output routine + bool OutDetail(ostream &os = cout, bool ForceExit = false); // Detailed output routine (nodes, branches, and tree) + + // Tree modification routines + void Unroot(); // If rooted, this function permanently unroots it + void OrderNode(int NodeNum = -1,bool DoBraToo = true); // Orders tree nodes to ascending numerical value (-1 does all) + int CutBranch(int Branch,int Link); // Cuts a branch: returns the branch on priority subtree where they used to be attached -- Subtrees are maintained; Link specifies which one is given priority + int RemoveBraNode(int Node); // Removes a node from a tree: returns the branch where link used to be attached -- NB: One Link must be set to -1 + int RemoveLeafNode(int RemNode); + int AddSeq(int SeqNum, int Branch, double BranchProp =0.5); // Adds a sequence to the tree + // SeqNum = sequence added; Branch = branch added to; BranchProp = relative position in branch (from low to high node nums) + int GetStart(bool replace = true); // Find branch in tree from which to recurse + double GetTreeLength(bool first = true, int NTo = -1, int NFr = -1); // Get remaining stuff in a tree + void BuildOriSubTree(CTree *T, vector NodesBool); // Returns subtree from an array of bools describing which nodes it covers + void BuildOriSubTree(CTree *T, vector LeafMap, vector NCover, vector NFrom); // Returns subtree from LeafMap and NodesCovered + void BuildOriSubTree(CTree *T, vector NodesCovered); // Returns subtree from the nodes it covers + void BuildBraLinks(bool Verify = true); // Builds the branch links from nodes + + // Function to create a consistent output for comparison + vector ConstOut(); + int NodeDist(int Node1, int Node2, int NodeFrom = -1); // Count number of branches separating nodes i and j; + int BranchDist(int Br1, int Br2, bool AllowZero = false); // Count the number of branches separating branches Br1 and Br2; !AllowZero means that very short branches will not be considered real branches + // Functions for comparing trees using Robinson-Foulds distance + int GetRFDist(CTree &Tree); // Standard comparison between two trees of same number of taxa + bool IsCompatible(CTree &SubTree); // Compares tree with #Seq <= *this->#Seq to check whether they are compatible (High level function accepting tree objects) + + // Functions for adding sequences to a subtree tree based on an existing full tree (Greedy algorithm for maximising tree length) + + + // Functions for testing properties of trees + bool IsNode(int Node); // Used for assert statements + bool IsBra(int Branch); // Used for assert statements + bool IsCutTree(); // Whether tree has had sequences removed... + bool GoodBra(int Branch); // Is an active branch in the tree + bool GoodNode(int Node); // Is an active node in the tree + + // Tree split-based functions + void BuildSplits(); // Build the set of splits associated with a tree. Current implementation always forces the rebuild + SSplit GetSplit(int Bra); // Return the split set for branch Bra + int BranchSets(int BranchNum, vector *Left, vector *Right); // Find the sets of leaf sequences that the branch splits + int FindBra(int Node_i, int Node_j); // Find branch linking Nodes i and j, returns -1 if none + // Returns the value of total number in the Left set (i.e. Left = m_ariBraLinks[0] ) + void GetBraSets(int NTo, int NFr, vector *List, bool First = true); + void OutSplits(ostream &os = cout); + // Functions to get pairwise distances from a tree + vector GetTreePW(); + vector GetAllTreePW(); + void PWDistSub(int NodeTo, int NodeFrom,vector *d,bool DoInternalNodes = false); + vector GetPartialTreeDist(vector LeafMap, vector > NBelow); + vector GetSubTreePW(vector LeafMap, vector > NBelow, vector BaseDist); + + // Function to get all of the nodes of a certain depth from a particular node; bool GetLess == true, will also return leaf nodes when they fall within this range + vector GetNodesOfDepth(int InitNode, int NodeDepth, bool GetLess, vector *NodesFrom = NULL, vector *NodeCov = NULL, vector *ExtBra = NULL ,int NodeFr = -1, bool First = true); + // Centering point functions used for snap + vector BranchCP(int CP, int depth, vector *NodeFr, vector *NodesCovered = NULL, vector *ExtBra = NULL); + vector NodeCP(int Node, int depth, vector *NodeFr, vector *NodesCovered = NULL, vector *ExtBra = NULL); + void ReplaceTreeCP(CTree *NT,vector LeafMap,vector NCover,bool VerifyBranchLinks = true); // Overwrites the current tree (from CPs) and puts NewTree in its place + // Operator= function + void operator=(const CTree &); + + vector > GetKnotClusters(vector IntNodeChanges, int ChangeRad = 2); // Function that takes an array[NoNodes()] of which branches change (or zero lengths, or whatever) + // and returns a set of clusters of overlapping changes of SNAP radius = ChangeRad +private: + // Private member variable + /////////////////////// + int m_iRootNode; // The root node (-1 if not rooted) + bool m_bRooted; // Whether the tree is rooted + int m_iNoNode; // The # of nodes in tree + int m_iMemNode; // Number of nodes stored in memory + int m_iNoBra; // The # of branches in tree + int m_iNoOptBra; // The # of branches optimised in the current tree + int m_iNoSeq; // The # of sequences in tree + int m_iStartCalc; // Leaf node from which to start calculations + vector m_vdBra; // The Branch length parameters + CNode ** m_Node; // The nodes in the tree + bool m_bReady; // Whether tree is ready + int m_iOutBra; // Out branch [0=no branches,1=branches,2=branch numbers] + int m_bOutName; // Out Names + bool m_bOutLabel; // Whether to output tree labels or not + vector m_vsName; // The names in the sequence + int **m_ariBraLinks; // the nodes linked to each branch [branch_number][links] + bool m_bOldTree; // Whether the tree has already been through a round of SNAP + bool m_bFastCalcOK; // Whether the tree is okay for FastCalc computations + bool m_bValid; // Flag to identify whether a valid constructor has been run + vector m_viBranchLabels; // Labels for different branch types (e.g. parameter per branch models) + int m_iNoBraLabels; // Number of unique branch labels + vector m_vSplits; // Vector containing the splits on a tree + + // Private member functions + //////////////////////////////////////////////////////////// + + // Find closest: gets closest nodes in a string + void find_closest(string *tree, int *c1, int *c2, int *p,int n_p, double *bra,int *label, int *IntVal); + void GetBraDetails(string s, int *node, double *bra, int *label); + ostream& OutNode(int FromNodeNum, int ToNode, ostream &os); + ostream& OutBranch(int ToNode, int FromNode, ostream &os); + double DoBranch(string *tree,int *pos, int *IntVal); + void BuildBranches(int NT = -1, int NF = -1); // Builds the branchs of a tree from nodes recursively + bool ValidateTree(bool AllowExit = true); + + //////////////////////////////////////////// + // Rearrangement functions + void BuildBraSA(CTree *T, int Seq, vector *TList, int OriBr, int SPRID, int MinDist, int MaxDist); + void Branch_SA(CTree *T, int Seq, vector *TList, int First, int NTo, int NFr, int OriBr, int SPRID, int MinDist, int MaxDist); + void DoSA(CTree *T, int Seq, vector *TList, int First, int NTo, int NFr, int Br, bool IsExtBra, int OriBr, int SPRID, int MinDist,int MaxDist); +}; + +bool IsSameTree(CTree *T1, CTree *T2); +void ExpandNode(int Node, string *String, int Stringpos, CTree *TREE); +int IsTreeGap(char Check); +// Functions for finding the greedy subtree +CTree FindGreedySubTree(CTree *FullTree, int NoSeq); // Driver function: uses the greedy algorithm to identify the optimal subtree +double TravAddGreedy(CTree *CurT, int To, int Fr, int Seq2Add, vector *PWDist, int *BestBra); // Function to traverse a tree and test which position maximises distance +double GetDist(int Add, int a, int b,vector *PWDist); // Check how much improvement a given sequence added to the tree would provide +void GreedySeq2Tree(int Bra,int Seq2Add, CTree *CurTree, vector *PWDists); // Adds the sequence to the tree + +bool SplitsCompatible(vector Splits1, int S1_seq, vector Splits2, int S2_seq); // Low level function just comparing a set of splits +bool CompareSplit(SSplit S1, SSplit S2); // Simple function for comparing splits + +vector ReadTreeNames(string Tree); + +#endif diff --git a/bionj.cxx b/bionj.cxx new file mode 100644 index 0000000..33fda6b --- /dev/null +++ b/bionj.cxx @@ -0,0 +1,713 @@ +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; ; +; BIONJ program ; +; ; +; Olivier Gascuel ; +; ; +; GERAD - Montreal- Canada ; +; olivierg@crt.umontreal.ca ; +; ; +; LIRMM - Montpellier- France ; +; gascuel@lirmm.fr ; +; ; +; UNIX version, written in C ; +; by Hoa Sien Cuong (Univ. Montreal) ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +// Test case 1 + +#include "Divvier.h" +#include "Tree.h" +#include +#include + +using namespace::std; + + +#define PREC 8 /* precision of branch-lengths */ +#define PRC 100 +#define LEN 10000 /* length of taxon names */ + +typedef struct word +{ + char name[LEN]; + struct word *suiv; +}WORD; + +typedef struct pointers +{ + WORD *head; + WORD *tail; +}POINTERS; + + +void Initialize(double **delta, FILE *input, int n, POINTERS *trees); + +void Compute_sums_Sx(double **delta, int n); + +void Best_pair(double **delta, int r, int *a, int *b, int n); + +string Finish(double **delta, int n, POINTERS *trees); + +void Concatenate(char chain1[LEN], int ind, POINTERS *trees, int post); + +string Print_output(int i, POINTERS *trees); + +double Distance(int i, int j, double **delta); + +double Variance(int i, int j, double **delta); + +double Sum_S(int i, double **delta); + +double Agglomerative_criterion(int i, int j, double **delta, int r); + +double Branch_length(int a, int b, double **delta, int r); + +double Reduction4(int a, double la, int b, double lb, int i, double lamda, + double **delta); + +double Reduction10(int a, int b, int i, double lamda, double vab, double + **delta); +double Lamda(int a, int b, double vab, double **delta, int n, int r); + +double Finish_branch_length(int i, int j, int k, double **delta); + +int Emptied(int i, double **delta); + +int Symmetrize(double **delta, int n); + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; ; +; Main program ; +; ; +; argc is the number of arguments ; +; **argv contains the arguments: ; +; the first argument has to be BIONJ; ; +; the second is the inptu-file; ; +; the third is the output-file. ; +; When the input and output files are ; +; not given, the user is asked for them. ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +string DoBioNJ(vector PWdists, vector Names, bool DoNumbers) { + POINTERS *trees; /* list of subtrees */ + WORD *name; /* Used for transferring sequence names */ + char *Name_fich1; /* name of the input file */ + char *Name_fich2; /* name of the output file */ + char *chain1; /* stringized branch-length */ + char *chain2; /* idem */ + int *a, *b; /* pair to be agglomerated */ + double la; /* first taxon�s branch-length */ + double lb; /* second taxon�s branch-length*/ + double vab; /* variance of Dab */ + double lamda; + double **delta; + int i,j,k,r,x,y,n = (int) Names.size(); + string return_tree; + + /* Allocation of memories */ + Name_fich1=(char*)calloc(LEN,sizeof(char)); + Name_fich2=(char*)calloc(LEN,sizeof(char)); + a=(int*)calloc(1,sizeof(int)); + b=(int*)calloc(1,sizeof(int)); + chain1=(char *)calloc(LEN,sizeof(char)); + chain2=(char *)calloc(LEN,sizeof(char)); + GET_MEM(delta,double*,n+1); FOR(i,n+1) { GET_MEM(delta[i],double,n+1); } + FOR(i,n+1){ delta[0][i] = 0.0; delta[i][0] = 0.0; } + k=0; for(i=1;i<=n;i++) { for(j=1;j<=n;j++) { delta[i][j] = PWdists[k++]; } } + + // Assign names + trees=(POINTERS *)calloc(n+1,sizeof(POINTERS)); + for(i = 1; i <= n; i++) { + name = (WORD *) calloc(1,sizeof(WORD)); + if(DoNumbers) { strcpy(name->name,int_to_string(i).c_str()); + } else { strcpy(name->name,Names[i-1].c_str()); } + assert(name != NULL); + name->suiv = NULL; + trees[i].head = name; + trees[i].tail = name; + } + /* initialise and symmetrize the running delta matrix */ + r=n; + *a=0; + *b=0; + + // Do bionj + while (r > 3) { /* until r=3 */ + Compute_sums_Sx(delta, n); /* compute the sum Sx */ + Best_pair(delta, r, a, b, n); /* find the best pair by */ + vab=Variance(*a, *b, delta); /* minimizing (1) */ + la=Branch_length(*a, *b, delta, r); /* compute branch-lengths */ + lb=Branch_length(*b, *a, delta, r); /* using formula (2) */ + lamda=Lamda(*a, *b, vab, delta, n, r); /* compute lambda* using (9)*/ + for(i=1; i <= n; i++) { + if(!Emptied(i,delta) && (i != *a) && (i != *b)) { + if(*a > i) { x=*a; y=i; } + else { x=i; y=*a; } /* apply reduction formulae */ + /* 4 and 10 to delta */ + delta[x][y]=Reduction4(*a, la, *b, lb, i, lamda, delta); + delta[y][x]=Reduction10(*a, *b, i, lamda, vab, delta); + } } + + strcpy(chain1,""); /* agglomerate the subtrees */ + strcat(chain1,"("); /* a and b together with the*/ + Concatenate(chain1, *a, trees, 0); /* branch-lengths according */ + strcpy(chain1,""); /* to the NEWSWICK format */ + strcat(chain1,":"); + sprintf(chain1+strlen(chain1),"%f",my_max(la,0)); + strcat(chain1,","); + Concatenate(chain1,*a, trees, 1); + trees[*a].tail->suiv=trees[*b].head; + trees[*a].tail=trees[*b].tail; + strcpy(chain1,""); + strcat(chain1,":"); + sprintf(chain1+strlen(chain1),"%f",my_max(lb,0)); + strcat(chain1,")"); + Concatenate(chain1, *a, trees, 1); + delta[*b][0]=1.0; /* make the b line empty */ + trees[*b].head=NULL; + trees[*b].tail=NULL; + r--; /* decrease r */ + } + return_tree = Finish(delta, n, trees); /* compute the branch-lengths*/ + for(i=1; i<=n; i++) { /* of the last three subtrees*/ + delta[i][0]=0.0; /* and print the tree in the */ + trees[i].head=NULL; /* output-file */ + trees[i].tail=NULL; + } + // Clear memory (added by SW) + free(trees); + free(a); free(b); free(Name_fich1); free(Name_fich2); + free(chain1); free(chain2); + FOR(i,n+1) { delete [] delta[i]; } delete [] delta; + return return_tree; +} + + +/*;;;;;;;;;;; INPUT, OUTPUT, INITIALIZATION ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; ; +; ; +; The delta matrix is read from the input-file. ; +; It is recommended to put it and the executable in ; +; a special directory. The input-file and output-file ; +; can be given as arguments to the executable by ; +; typing them after the executable (Bionj input-file ; +; output-file) or by typing them when asked by the ; +; program. The input-file has to be formated according ; +; the PHYLIP standard. The output file is formated ; +; according to the NEWSWICK standard. ; +; ; +; The lower-half of the delta matrix is occupied by ; +; dissimilarities. The upper-half of the matrix is ; +; occupied by variances. The first column ; +; is initialized as 0; during the algorithm some ; +; indices are no more used, and the corresponding ; +; positions in the first column are set to 1. ; +; ; +; This delta matix is made symmetrical using the rule: ; +; Dij = Dji <- (Dij + Dji)/2. The diagonal is set to 0; ; +; during the further steps of the algorithm, it is used ; +; to store the sums Sx. ; +; ; +; A second array, trees, is used to store taxon names. ; +; During the further steps of the algoritm, some ; +; positions in this array are emptied while the others ; +; are used to store subtrees. ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + + +/*;;;;;;;;;;;;;;;;;;;;;;;;;; Initialize ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\ +; ; +; Description : This function reads an input file and return the ; +; delta matrix and trees: the list of taxa. ; +; ; +; input : ; +; double **delta : delta matrix ; +; FILE *input : pointer to input file ; +; int n : number of taxa ; +; char **trees : list of taxa ; +; ; +; return value: ; +; double **delta : delta matrix ; +; char *trees : list of taxa ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +void Initialize(double **delta, FILE *input, int n, POINTERS *trees) +{ + int lig; /* matrix line */ + int col; /* matrix column */ + double distance; + char name_taxon[LEN]; /* taxon�s name */ + WORD *name; + + for(lig=1; lig <= n; lig++) + { + fscanf(input,"%s",name_taxon); /* read taxon�s name */ + name=(WORD *)calloc(1,sizeof(WORD)); /* taxon�s name is */ + if(name == NULL) /* put in trees */ + { + printf("Out of memories !!"); + exit(0); + } + else + { + strcpy(name->name,name_taxon); + name->suiv=NULL; + trees[lig].head=name; + trees[lig].tail=name; + for(col= 1; col <= n; col++) + { + fscanf(input,"%f",&distance); /* read the distance */ + delta[lig][col]=distance; + } + } + } +} + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;; Print_output;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\ +; ; +; ; +; Description : This function prints out the tree in the output file. ; +; ; +; input : ; +; POINTERS *trees : pointer to the subtrees. ; +; int i : indicate the subtree i to be printed. ; +: FILE *output : pointer to the output file. ; +; ; +; return value: The phylogenetic tree in the output file. ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + + +string Print_output(int i, POINTERS *trees) +{ + stringstream out; + WORD *parcour; + parcour=trees[i].head; + while(parcour != NULL) + { + out << parcour->name; + parcour=parcour->suiv; + } + return out.str(); +} + + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\ +; ; +; Utilities ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + + + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;; Symmetrize ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\ +; ; +; Description : This function verifies if the delta matrix is symmetric; ; +; if not the matrix is made symmetric. ; +; ; +; input : ; +; double **delta : delta matrix ; +; int n : number of taxa ; +; ; +; return value: ; +; int symmetric : indicate if the matrix has been made ; +; symmetric or not ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +int Symmetrize(double **delta, int n) +{ + int lig; /* matrix line */ + int col; /* matrix column */ + double value; /* symmetrized value */ + int symmetric; + + symmetric=1; + for(lig=1; lig <= n; lig++) + { + for(col=1; col< lig; col++) + { + if(delta[lig][col] != delta[col][lig]) + { + value= (delta[lig][col]+delta[col][lig])/2; + delta[lig][col]=value; + delta[col][lig]=value; + symmetric=0; + } + } + } + if(!symmetric) + printf("The matrix is not symmetric"); + return(symmetric); +} + + + + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;; Concatenate ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\ +; ; +; ; +; Description : This function concatenates a string to another. ; +; ; +; input : ; +; char *chain1 : the string to be concatenated. ; +; int ind : indicate the subtree to which concatenate the ; +; string ; +; POINTERS *trees : pointer to subtrees. ; +; int post : position to which concatenate (front (0) or ; +; end (1)) ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +void Concatenate(char chain1[LEN], int ind, POINTERS *trees, int post) +{ + WORD *bran; + bran=(WORD *)calloc(1,sizeof(WORD)); + if(bran == NULL) + { + printf("Out of memories"); + exit(0); + } + else + { + strcpy(bran->name,chain1); + bran->suiv=NULL; + } + + + + if(post == 0) + { + bran->suiv=trees[ind].head; + trees[ind].head=bran; + } + else + { + trees[ind].tail->suiv=bran; + trees[ind].tail=trees[ind].tail->suiv; + } +} + + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Distance;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\ +; ; +; Description : This function retrieve ant return de distance between taxa ; +; i and j from the delta matrix. ; +; ; +; input : ; +; int i : taxon i ; +; int j : taxon j ; +; double **delta : the delta matrix ; +; ; +; return value: ; +; double distance : dissimilarity between the two taxa ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +double Distance(int i, int j, double **delta) +{ + if(i > j) + return(delta[i][j]); + else + return(delta[j][i]); +} + + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Variance;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\ +; ; +; Description : This function retrieve and return the variance of the ; +; distance between i and j, from the delta matrix. ; +; ; +; input : ; +; int i : taxon i ; +; int j : taxon j ; +; double **delta : the delta matrix ; +; ; +; return value: ; +; double distance : the variance of Dij ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +double Variance(int i, int j, double **delta) +{ + if(i > j) + return(delta[j][i]); + else + return(delta[i][j]); +} + + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Emptied ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\ +; ; +; Description : This function verifie if a line is emptied or not. ; +; ; +; input : ; +; int i : subtree (or line) i ; +; double **delta : the delta matrix ; +; ; +; return value: ; +; 0 : if not emptied. ; +; 1 : if emptied. ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +int Emptied(int i, double **delta) /* test if the ith line is emptied */ +{ + return((int)delta[i][0]); +} + + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Sum_S;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\ +; ; +; Description : This function retrieves the sum Sx from the diagonal ; +; of the delta matrix. ; +; ; +; input : ; +; int i : subtree i ; +; double **delta : the delta matrix ; +; ; +; return value: ; +; double delta[i][i] : sum Si ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +double Sum_S(int i, double **delta) /* get sum Si form the diagonal */ +{ + return(delta[i][i]); +} + + +/*;;;;;;;;;;;;;;;;;;;;;;;Compute_sums_Sx;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\ +; ; +; Description : This function computes the sums Sx and store them in the ; +; diagonal the delta matrix. ; +; ; +; input : ; +; double **delta : the delta matrix. ; +; int n : the number of taxa ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +void Compute_sums_Sx(double **delta, int n) +{ + double sum = 0; + int i; + int j; + + for(i= 1; i <= n ; i++) + { + if(!Emptied(i,delta)) + { + sum=0; + for(j=1; j <=n; j++) + { + if(i != j && !Emptied(j,delta)) /* compute the sum Si */ + sum=sum + Distance(i,j,delta); + } + } + delta[i][i]=sum; /* store the sum Si in */ + } /* delta�s diagonal */ +} + + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Best_pair;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\ +; ; +; Description : This function finds the best pair to be agglomerated by ; +; minimizing the agglomerative criterion (1). ; +; ; +; input : ; +; double **delta : the delta matrix ; +; int r : number of subtrees ; +; int *a : contain the first taxon of the pair ; +; int *b : contain the second taxon of the pair ; +; int n : number of taxa ; +; ; +; return value: ; +; int *a : the first taxon of the pair ; +; int *b : the second taxon of the pair ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +void Best_pair(double **delta, int r, int *a, int *b, int n) +{ + double Qxy; /* value of the criterion calculated*/ + int x,y; /* the pair which is tested */ + double Qmin; /* current minimun of the criterion */ + + Qmin=BIG_NUMBER; + for(x=1; x <= n; x++) + { + if(!Emptied(x,delta)) + { + for(y=1; y < x; y++) + { + if(!Emptied(y,delta)) + { + Qxy=Agglomerative_criterion(x,y,delta,r); + if(Qxy < Qmin-0.000001) + { + Qmin=Qxy; + *a=x; + *b=y; + } + } + } + } + } +} + + +/*;;;;;;;;;;;;;;;;;;;;;;Finish_branch_length;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\ +; ; +; Description : Compute the length of the branch attached ; +; to the subtree i, during the final step ; +; ; +; input : ; +; int i : position of subtree i ; +; int j : position of subtree j ; +; int k : position of subtree k ; +; double **delta : ; +; ; +; return value: ; +; double length : The length of the branch ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +double Finish_branch_length(int i, int j, int k, double **delta) +{ + double length; + length=0.5*(Distance(i,j,delta) + Distance(i,k,delta) + -Distance(j,k,delta)); + return length; +} + + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;Finish;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\ +; ; +; Description : This function compute the length of the lasts three ; +; subtrees and write the tree in the output file. ; +; ; +; input : ; +; double **delta : the delta matrix ; +; int n : the number of taxa ; +; WORD *trees : list of subtrees ; +; ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + +string Finish(double **delta, int n, POINTERS *trees) { + int l=1; + int i=0; + double length; + char *str; + WORD *bidon; + WORD *ele; + int last[3]; /* the last three subtrees */ + stringstream out; + + str=(char *)calloc(LEN,sizeof(char)); + + if(str == NULL) { printf("Out of memories !!"); exit(0); } + while(l <= n) { /* find the last tree subtree */ + if(!Emptied(l, delta)) { last[i]=l; i++; } + l++; + } + length=Finish_branch_length(last[0],last[1],last[2],delta); + out << "(" << Print_output(last[0],trees) << ":" << my_max(0,length) << ","; + length=Finish_branch_length(last[1],last[0],last[2],delta); + out << Print_output(last[1],trees) << ":" << my_max(0,length) << ","; // fprintf(output,":"); fprintf(output,"%f,",length); + length=Finish_branch_length(last[2],last[1],last[0],delta); + out << Print_output(last[2],trees) << ":" << my_max(0,length) << ");"; + FOR(i,3) { + bidon=trees[last[i]].head; + ele=bidon; + while(bidon!=NULL) + { + ele=ele->suiv; + free(bidon); + bidon=ele; + } + } + free(str); + return out.str(); +} + + +/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*\ +; ; +; Formulae ; +; ; +\*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/ + + +double Agglomerative_criterion(int i, int j, double **delta, int r) +{ + double Qij; + Qij=(r-2)*Distance(i,j,delta) /* Formula (1) */ + -Sum_S(i,delta) + -Sum_S(j,delta); + + return(Qij); +} + + +double Branch_length(int a, int b, double **delta, int r) +{ + double length; + length=0.5*(Distance(a,b,delta) /* Formula (2) */ + +(Sum_S(a,delta) + -Sum_S(b,delta))/(r-2)); + return(length); +} + + +double Reduction4(int a, double la, int b, double lb, int i, double lamda, + double **delta) +{ + double Dui; + Dui=lamda*(Distance(a,i,delta)-la) + +(1-lamda)*(Distance(b,i,delta)-lb); /* Formula (4) */ + return(Dui); +} + + +double Reduction10(int a, int b, int i, double lamda, double vab, + double **delta) +{ + double Vci; + Vci=lamda*Variance(a,i,delta)+(1-lamda)*Variance(b,i,delta) + -lamda*(1-lamda)*vab; /*Formula (10) */ + return(Vci); +} + +double Lamda(int a, int b, double vab, double **delta, int n, int r) +{ + double lamda=0.0; + int i; + + if(vab==0.0) + lamda=0.5; + else + { + for(i=1; i <= n ; i++) + { + if(a != i && b != i && !Emptied(i,delta)) + lamda=lamda + (Variance(b,i,delta) - Variance(a,i,delta)); + } + lamda=0.5 + lamda/(2*(r-2)*vab); + } /* Formula (9) and the */ + if(lamda > 1.0) /* constraint that lamda*/ + lamda = 1.0; /* belongs to [0,1] */ + if(lamda < 0.0) + lamda=0.0; + return(lamda); +} + + + + + diff --git a/hmm.c b/hmm.c index eebb5e7..00e94a2 100644 --- a/hmm.c +++ b/hmm.c @@ -130,7 +130,7 @@ double *YCbmatrix; double pxy; -double *posterior; +double *zorro_posterior; int *sample; int *Xpos; int *Ypos; @@ -156,11 +156,9 @@ int BOUND_SIZE; // The active one int BOUND_ATTEMPTS = 3; // Number of attempts at bounded prob before aborting and doing full calc double bound_error_threshold = 0.005; // Acceptable probability on pairHMM edge; this has the greatest effect on PP stability (0.0001 seem very stable) ///////////////////////////////////////////////////////////////// - - -void calc_posterior(int len) { - int i, j; - posterior = (double *) (malloc((len) * sizeof(double))); +void calc_prep(int len) { + int i; + zorro_posterior = (double *) (malloc((len) * sizeof(double))); sample = (int *) (malloc((len) * sizeof(int))); Xpos = (int *) (malloc((len) * sizeof(int))); Ypos = (int *) (malloc((len) * sizeof(int))); @@ -170,9 +168,13 @@ void calc_posterior(int len) { leftBounds = (int *) malloc((len) * sizeof(int)); rightBounds = (int *) malloc((len) * sizeof(int)); for (i = 0; i < len; i++) { - posterior[i] = 0.0; - sample[i] = 0; + zorro_posterior[i] = 0.0; } +} + +void calc_posterior(int len) { + int i ,j; + calc_prep(len); TOT_DIST = 0.0; for (i = 0; i < Nseq; i++) { for (j = i + 1; j < Nseq; j++) { @@ -217,8 +219,8 @@ bool MakePosteriors(int X, int Y, bool Flip) { // printf("\nDoing posterior %d %d\n", X, Y); fflush(stdout); // Build the bounds and run the pairHMM BuildBounds(X, Y, Flip); - approx_forward(sequence[X], sequence[Y], lens[X], lens[Y]); - approx_backward(sequence[X], sequence[Y], lens[X], lens[Y]); + approx_forward(zorro_sequence[X], zorro_sequence[Y], lens[X], lens[Y]); + approx_backward(zorro_sequence[X], zorro_sequence[Y], lens[X], lens[Y]); // Goes through the sequence and checks there isn't much probability mass on the non-edge bounds for (i = 1; i <= lens[X]; i++) { @@ -239,6 +241,16 @@ bool MakePosteriors(int X, int Y, bool Flip) { return true; } +// Modified version of addPosterior +void getSinglePosterior(int X, int Y) { + // Initialise the posterior to zero + for (int i = 0; i < alen; i++) { + zorro_posterior[i] = 0.0; + } + // Normal call + addPosterior(X, Y); +} + #define DEBUG_PP 0 void addPosterior(int X, int Y) { int i, Xp, Yp; @@ -247,8 +259,8 @@ void addPosterior(int X, int Y) { // First build the map of aln_position -> sequence_position Xp = Yp = 0; for (i = 0; i < alen; i++) { - if (align[X][i] == 20) { - if (align[Y][i] == 20) { + if (zorro_align[X][i] == 20) { + if (zorro_align[Y][i] == 20) { states[i] = -1; } else { Yp++; @@ -256,7 +268,7 @@ void addPosterior(int X, int Y) { } } else { Xp++; - if (align[Y][i] == 20) { + if (zorro_align[Y][i] == 20) { states[i] = 1; } else { states[i] = 0; @@ -269,8 +281,8 @@ void addPosterior(int X, int Y) { int *posX = Xpos, *posY = Ypos; if (!DO_APPROX) { - forward(sequence[X], sequence[Y], lens[X], lens[Y]); - backward(sequence[X], sequence[Y], lens[X], lens[Y]); + forward(zorro_sequence[X], zorro_sequence[Y], lens[X], lens[Y]); + backward(zorro_sequence[X], zorro_sequence[Y], lens[X], lens[Y]); } else { if (RunMakePosteriors(X, Y)) { // printf("Done flip on %d,%d\n",X,Y); @@ -291,28 +303,28 @@ void addPosterior(int X, int Y) { sample[i]++; switch (states[i]) { case -1: - posterior[i] += 0.0; + zorro_posterior[i] += 0.0; sample[i]--; - printf("%s ", "NA"); +// printf("%s ", "NA"); break; case 0: f = exp(Mfmatrix[posX[i]][posY[i]] + Mbmatrix[posX[i]][posY[i]] - pxy); //printf("%d %d : %f %f %f\n",Xpos[i],Ypos[i],Mfmatrix[Xpos[i]][Ypos[i]],Mbmatrix[Xpos[i]][Ypos[i]],Mfmatrix[Xpos[i]][Ypos[i]]+Mbmatrix[Xpos[i]][Ypos[i]]-pxy); - printf("%f ", f); - posterior[i] += f; +// printf("%f ", f); + zorro_posterior[i] += f; break; case 1: f = exp(Xfmatrix[posX[i]][posY[i]] + Xbmatrix[posX[i]][posY[i]] - pxy); - printf("%s ", "NA"); +// printf("%s ", "NA"); // printf("%f ", f); - posterior[i] += f; + zorro_posterior[i] += f; break; case 2: f = exp(Yfmatrix[posX[i]][posY[i]] + Ybmatrix[posX[i]][posY[i]] - pxy); - printf("%s ", "NA"); +// printf("%s ", "NA"); // printf("%f ", f); - posterior[i] += f; + zorro_posterior[i] += f; break; default: error( @@ -320,7 +332,7 @@ void addPosterior(int X, int Y) { } } - printf("\n"); +// printf("\n"); posX = NULL; posY = NULL; } @@ -896,7 +908,6 @@ void backward(char *seqX,char *seqY,int lenX,int lenY){ void BuildBounds(int X, int Y, bool Flip) { assert(lens[X] <= lens[Y]); - int diff = lens[Y] - lens[X]; int *posX = Xpos, *posY = Ypos; if(Flip) { posX = Ypos; posY = Xpos; } diff --git a/hmm.h b/hmm.h index ef503c0..2c50355 100644 --- a/hmm.h +++ b/hmm.h @@ -43,7 +43,9 @@ EXTERN double selfC; EXTERN double emitSingleDefault[20]; EXTERN double emitPairsDefault[20][20]; - +EXTERN double *zorro_posterior; void initHMM(int len); +void calc_prep(int len); void calc_posterior(int len); +void getSinglePosterior(int X, int Y); // Calculate the posterior and store it in double * posterior diff --git a/trim.c b/trim.c index 0d9f62d..91ceb39 100644 --- a/trim.c +++ b/trim.c @@ -19,16 +19,7 @@ #undef EXTERN -int main(int argc,char **argv){ - if(argc < 2){ - printf("Input file not specified\n"); - exit(EXIT_FAILURE); - } - - clock_t start, end; - double cpu_time_used; - start = clock(); - +void ReadAndPrepZorro(char * file){ // Hard coded options JTT = 0; @@ -36,20 +27,11 @@ int main(int argc,char **argv){ PAM = 1; MATRICES = 0; - readSeq(argv[argc-1]); - - printf("%d %d\n",Nseq,alen); - for(int i = 0 ; i < Nseq ; i++) { - printf("%s\n",names[i]); - } + readSeq(file); initHMM(alen); - calc_posterior(alen); - - end = clock(); - cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC; + calc_prep(alen); + // calc_posterior(alen); - fprintf(stderr,"CPU time used: %.2f seconds\n",cpu_time_used); - return 1; } diff --git a/trim.h b/trim.h index 4db8854..b666c45 100644 --- a/trim.h +++ b/trim.h @@ -6,3 +6,4 @@ #define N_PEPT 20 #define MAX_LABEL_LEN 20 +void ReadAndPrepZorro(char *file); // Initialisation function for zorro. After this the HMMs are ready for RunMakePosteriors(int X, int Y) diff --git a/utils.c b/utils.c index eccc711..c8a69c4 100644 --- a/utils.c +++ b/utils.c @@ -30,9 +30,10 @@ int pep2num(char c); char *removeGaps(char *seq,int len,int *nlen); int Nseq; -char **align; -char **sequence; -char **names; +char ** zorro_raw_seq; +char **zorro_align; +char **zorro_sequence; +char **zorro_names; int *lens; int alen; double TOT_DIST; @@ -57,42 +58,52 @@ void error(char *fmt, ... ) { int readSeq(char *inFile){ char *seq; int len,i; - int tmp = 0; + int tmp = 0, full_seq; FILE *fp1,*fp2; int pflag; if((fp1 = fopen(inFile,"r")) == NULL){ error((char *)"readSeq: can't open %s for read, exiting\n",inFile); } // Calculate the number of sequences - tmp = 0; + full_seq = tmp = 0; while(fgets(line,MAX_LINE_LEN,fp1) != NULL){ if(line[0] == '>'){ tmp++; } } + full_seq = tmp; fclose(fp1); fp1 = fopen(inFile,"r"); fp2 = fopen(inFile,"r"); Nseq = 0; alen = -1; - align = (char **)(malloc(tmp * sizeof(char *))); - sequence = (char **)(malloc(tmp * sizeof(char *))); - names = (char **)(malloc(tmp * sizeof(char *))); + zorro_raw_seq = (char **)(malloc(tmp * sizeof(char *))); + zorro_align = (char **)(malloc(tmp * sizeof(char *))); + zorro_sequence = (char **)(malloc(tmp * sizeof(char *))); + zorro_names = (char **)(malloc(tmp * sizeof(char *))); lens = (int *)(malloc(tmp * sizeof(int))); while(1){ - seq = readNextSeq(inFile,&len,fp1,fp2,&pflag,names+Nseq); + seq = readNextSeq(inFile,&len,fp1,fp2,&pflag,zorro_names+Nseq); if(len < 0){ break; } if(alen == -1){ alen = len; + for(int k = 0 ; k < full_seq ; k++) { + zorro_align[k] = (char*) ( malloc(alen * sizeof(char*)) ); + zorro_raw_seq[k] = (char*) ( malloc((alen + 1) * sizeof(char*)) ); + } } else if(len != alen){ error("Wrong alignment\n"); } //fprintf(stderr,">seq%d\n%s\n",Nseq,seq); - align[Nseq] = seq; + // SW hacked so input is cleaner + for(int k = 0; k < alen; k++) { + zorro_raw_seq[Nseq][k] = seq[k]; + zorro_align[Nseq][k] = pep2num(seq[k]); + } Nseq++; } if(tmp != Nseq){ @@ -100,10 +111,10 @@ int readSeq(char *inFile){ } fclose(fp1); fclose(fp2); - fprintf(stderr,"%d sequences, Length %d\n",Nseq,alen); - +// fprintf(stderr,"%d sequences, Length %d\n",Nseq,alen); + for(i=0;i