Skip to content
Browse files

Merge branch 'cpp11' into common_cpp

trying to unify c++98 and c++11 codebases in a single branch.

Conflicts:
	CMakeLists.txt
	aligned_buffer.h
	papara.h
	stepwise_addition.cpp
	stepwise_addition_pro.cpp
	vec_unit.h
  • Loading branch information...
2 parents c4f821d + da44e72 commit 92703404d1cd170b485bcf293d49b7ccf9d8011c @sim82 committed Nov 20, 2012
Showing with 2,682 additions and 278 deletions.
  1. +37 −63 CMakeLists.txt
  2. +5 −2 align_vec.h
  3. +52 −0 dump_anc_probs.cpp
  4. +1 −1 genassign_blast
  5. +1 −1 ivy_mike
  6. +9 −0 pairwise_seq_distance.cpp
  7. +5 −2 pairwise_seq_distance.h
  8. +1 −1 papara.h
  9. +1 −1 pars_align_gapp_seq.h
  10. +1 −1 phy_to_fasta.cpp
  11. +496 −96 propara.cpp
  12. +7 −2 pvec.h
  13. +46 −36 raxml_interface.cpp
  14. +4 −0 raxml_interface.h
  15. +12 −0 sequence_model.cpp
  16. +79 −0 sequence_model.h
  17. +3 −1 stepwise_addition.cpp
  18. +1,810 −43 stepwise_addition_pro.cpp
  19. +5 −5 stepwise_align.h
  20. +46 −4 testbench.cpp
  21. +32 −1 tree_utils.h
  22. +29 −18 vec_unit.h
View
100 CMakeLists.txt
@@ -14,33 +14,41 @@ add_subdirectory(ublasJama-1.0.2.3)
# this is a hack: only add subdirectories if IVYMIKE_INCLUDE is not already set.
# IVYMIKE_INCLUDE is abused as a flag indicating that papara should be build in 'library-only' mode...
add_subdirectory(ivy_mike)
-add_subdirectory(genassign_blast)
+# add_subdirectory(genassign_blast)
# ENDIF()
include_directories( ${include_directories} ${IVYMIKE_INCLUDE} . ublasJama-1.0.2.3 )
+set( USE_CPP11 no )
+
# handle the stupid autolinking shit of visualc++
IF(WIN32)
-IF( NOT BOOST_ROOT)
-set(BOOST_ROOT C:\\src\\boost_1_48_0)
-ENDIF()
-include_directories( ${BOOST_ROOT} )
-#LINK_DIRECTORIES( ${LINK_DIRECTORIES} ${BOOST_ROOT}\\stage\\lib)
-set( BOOST_LIBS )
-set( SYSDEP_LIBS )
-set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_SCL_SECURE_NO_WARNINGS /INCREMENTAL:NO") # as long as there is no support for std::array, these warnings are plain stupid!
+ IF( NOT BOOST_ROOT)
+ set(BOOST_ROOT C:\\src\\boost_1_48_0)
+ ENDIF()
+
+ include_directories( ${BOOST_ROOT} )
+ #LINK_DIRECTORIES( ${LINK_DIRECTORIES} ${BOOST_ROOT}\\stage\\lib)
+ set( BOOST_LIBS )
+ set( SYSDEP_LIBS )
+ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_SCL_SECURE_NO_WARNINGS /INCREMENTAL:NO") # as long as there is no support for std::array, these warnings are plain stupid!
-file(GLOB ALL_HEADERS *.h)
+ file(GLOB ALL_HEADERS *.h)
-ELSE()
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++98 -pedantic -Wall -march=native")
-set( BOOST_LIBS boost_thread boost_program_options)
-set(SYSDEP_LIBS pthread)
-#LINK_DIRECTORIES( ${LINK_DIRECTORIES} /usr/lib64/atlas-sse2 )
+ELSE(WIN32)
-set( ALL_HEADERS )
-ENDIF()
+ if( USE_CPP11 )
+ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -pedantic -Wall -march=native")
+ else()
+ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++98 -pedantic -Wall -march=native")
+ endif()
+ set( BOOST_LIBS boost_thread boost_program_options)
+ set(SYSDEP_LIBS pthread)
+ #LINK_DIRECTORIES( ${LINK_DIRECTORIES} /usr/lib64/atlas-sse2 )
+
+ set( ALL_HEADERS )
+ENDIF(WIN32)
@@ -53,52 +61,18 @@ add_executable(fasta_to_phy fasta_to_phy.cpp ${ALL_HEADERS})
add_executable(phy_to_fasta phy_to_fasta.cpp ${ALL_HEADERS})
add_executable(phy_cut_partition phy_cut_partition.cpp ${ALL_HEADERS})
-# add_executable(pw_dist pw_dist.cpp pairwise_seq_distance.cpp )
-
-# target_link_libraries(papara_nt ivymike ublas_jama ${SYSDEP_LIBS} )
target_link_libraries(papara papara_core ivymike ublas_jama ${SYSDEP_LIBS} )
target_link_libraries(phy_to_fasta ivymike ${SYSDEP_LIBS} )
target_link_libraries(phy_cut_partition papara_core ublas_jama ivymike ${SYSDEP_LIBS} )
-# target_link_libraries(pw_dist ivymike ${BOOST_LIBS} ${SYSDEP_LIBS})
-
-
-# more experimental stuff that I don't care to force through microsoft's interpretation of c++...
-# also that stuff externally only depends on boost headers, which removes most windows related P.I.T.A.
-# if( NOT WIN32 )
-# add_executable(tree_similarity tree_similarity.cpp call_main2.cpp)
-# add_executable(propara propara.cpp pvec.cpp raxml_interface.cpp tree_similarity.cpp parsimony.cpp )
-# add_executable(smith_waterman smith_waterman.cpp )
-# add_executable(stepwise_addition stepwise_addition.cpp pars_align_seq.cpp pairwise_seq_distance.cpp tree_similarity.cpp parsimony.cpp )
-# add_executable(stepwise_addition_gapp stepwise_addition_gapp.cpp pvec.cpp pars_align_seq.cpp pairwise_seq_distance.cpp tree_similarity.cpp parsimony.cpp raxml_interface.cpp )
-# add_executable(stepwise_addition_pro stepwise_addition_pro.cpp pvec.cpp pairwise_seq_distance.cpp tree_similarity.cpp parsimony.cpp raxml_interface.cpp sequence_model.cpp )
-# add_executable(epa_extract_qs_covered epa_extract_qs_covered.cpp )
-#
-#
-# add_executable(tbb_test tbb.cpp )
-# add_executable(dtw dtw.cpp )
-# add_executable(inherit_test inherit_test.cpp )
-# add_executable(testbench testbench.cpp pars_align_seq.cpp )
-# # add_executable(atlas_test atlas.cpp )
-#
-# add_executable(test_bitset test_bitset.cpp)
-# #add_executable(endian_benchmark endian_benchmark.cpp)
-#
-# target_link_libraries(tree_similarity ivymike)
-# target_link_libraries(propara ivymike ${SYSDEP_LIBS} PocoFoundation ublas_jama ${BOOST_LIBS})
-# #target_link_libraries(stepwise_addition ivymike)
-# target_link_libraries(stepwise_addition ivymike ${BOOST_LIBS} ${SYSDEP_LIBS})
-# target_link_libraries(stepwise_addition_gapp ivymike ublas_jama ${BOOST_LIBS} ${SYSDEP_LIBS} PocoFoundation)
-# target_link_libraries(stepwise_addition_pro ivymike ${BOOST_LIBS} ${SYSDEP_LIBS} PocoFoundation)
-#
-# #target_link_libraries( atlas_test ivymike ${BOOST_LIBS} atlas cblas )
-# target_link_libraries(tbb_test ublas_jama ivymike )
-# target_link_libraries(testbench ${BOOST_LIBS} )
-# target_link_libraries(epa_extract_qs_covered ivymike)
-#
-# target_link_libraries(dtw ivymike)
-# target_link_libraries(inherit_test ivymike)
-#
-# target_link_libraries(test_bitset ivymike)
-# #target_link_libraries(endian_benchmark ivymike)
-#
-# endif()
+
+
+if( USE_CPP11 )
+ add_executable(stepwise_addition_pro stepwise_addition_pro.cpp pvec.cpp pairwise_seq_distance.cpp tree_similarity.cpp parsimony.cpp raxml_interface.cpp sequence_model.cpp ${ALL_HEADERS} )
+ add_executable(propara propara.cpp pvec.cpp raxml_interface.cpp tree_similarity.cpp parsimony.cpp )
+ add_executable(dump_anc_probs dump_anc_probs.cpp raxml_interface.cpp tree_similarity.cpp )
+
+ target_link_libraries(stepwise_addition_pro ivymike ${BOOST_LIBS} ${SYSDEP_LIBS} PocoFoundation ublas_jama)
+ target_link_libraries(propara papara_core ivymike ${BOOST_LIBS} ${SYSDEP_LIBS} PocoFoundation ublas_jama)
+ target_link_libraries(dump_anc_probs ivymike ublas_jama ${BOOST_LIBS} ${SYSDEP_LIBS} PocoFoundation)
+endif()
+
View
7 align_vec.h
@@ -21,8 +21,11 @@
#define __align_vec_h
#include "vec_unit.h"
-#include "aligned_buffer.h"
-#include "fasta.h"
+#include "ivymike/aligned_buffer.h"
+#include "ivymike/fasta.h"
+
+using ivy_mike::aligned_buffer;
+using ivy_mike::scoring_matrix;
template <class score_t>
struct persistent_state {
View
52 dump_anc_probs.cpp
@@ -0,0 +1,52 @@
+
+#include <iostream>
+#include <algorithm>
+#include <iterator>
+#include <iomanip>
+#include <boost/numeric/ublas/matrix.hpp>
+#include "raxml_interface.h"
+
+
+
+int main( int argc, char * argv[] ) {
+
+ if( argc != 2 ) {
+ std::cerr << "missing parameters: file name\n";
+ return 1;
+ }
+ char *file_name = argv[1];
+
+ std::ifstream is( file_name, std::ios::binary );
+
+ auto pvecs = read_binary_anc_probs(is);
+
+ for( auto & pv : pvecs )
+// auto &pv = pvecs.at(7);
+ {
+// std::cout << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n";
+
+
+ for( auto it = pv.begin2(); it != pv.end2(); ++it ) {
+ size_t num = std::count_if(it.begin(), it.end(), [](double a){return a > 0.255;} );
+
+ if( !true ) {
+ if( num > 1 ) {
+ std::cout << "meeeep\n";
+
+ }
+ } else {
+
+ //std::cout << num << " ";
+ std::cout << std::distance( pv.begin2(), it ) << " ";
+ std::for_each( it.begin(), it.end(), [&](double v) { std::cout << std::setw(14) << v; } );
+ std::cout << "\n";
+ }
+ }
+
+
+
+// break; // stop after first vector
+ }
+
+
+}
2 genassign_blast
@@ -1 +1 @@
-Subproject commit 4ede83da89bde865c6e58651fe23bdb196d70ff2
+Subproject commit c2c3622dfdfcf886e6759ca6d1856b06cc46c5c3
2 ivy_mike
@@ -1 +1 @@
-Subproject commit cfec9a56fb18ff3e4b77e546c3deb68ba583df5e
+Subproject commit 5626330f892acfa12c3d63015d6c53b3de85af37
View
9 pairwise_seq_distance.cpp
@@ -17,6 +17,9 @@
* along with papara. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <iostream>
+#include <fstream>
+
#include <algorithm>
#include <deque>
@@ -40,6 +43,8 @@ namespace timpl = ivy_mike;
#include "ivymike/time.h"
#include "ivymike/write_png.h"
#include "ivymike/tdmatrix.h"
+#include "ivymike/fasta.h"
+#include "ivymike/aligned_buffer.h"
#ifndef PWDIST_INLINE
// this means this file is not included by pairwise_seq_distance.h itself...
@@ -53,6 +58,10 @@ namespace timpl = ivy_mike;
#define PSD_DECLARE_INLINE
#endif
+
+using ivy_mike::scoring_matrix;
+using ivy_mike::aligned_buffer;
+
//typedef boost::multi_array<int,2> pw_score_matrix;
typedef ivy_mike::tdmatrix<int> pw_score_matrix;
View
7 pairwise_seq_distance.h
@@ -31,11 +31,14 @@
#include <stdint.h>
#include "ivymike/tdmatrix.h"
+namespace ivy_mike {
class scoring_matrix;
-bool pairwise_seq_distance( const std::vector< std::vector<uint8_t> > &seq_raw1, const std::vector< std::vector<uint8_t> > &seq_raw2, bool identical, ivy_mike::tdmatrix<int> &out_scores, const scoring_matrix &sm, const int gap_open, const int gap_extend, const size_t n_thread, const size_t block_size );
+}
+
+bool pairwise_seq_distance( const std::vector< std::vector<uint8_t> > &seq_raw1, const std::vector< std::vector<uint8_t> > &seq_raw2, bool identical, ivy_mike::tdmatrix<int> &out_scores, const ivy_mike::scoring_matrix &sm, const int gap_open, const int gap_extend, const size_t n_thread, const size_t block_size );
#endif
-inline bool pairwise_seq_distance( const std::vector< std::vector<uint8_t> > &seq_raw, ivy_mike::tdmatrix<int> &out_scores, const scoring_matrix &sm, const int gap_open, const int gap_extend, const size_t n_thread, const size_t block_size ) {
+inline bool pairwise_seq_distance( const std::vector< std::vector<uint8_t> > &seq_raw, ivy_mike::tdmatrix<int> &out_scores, const ivy_mike::scoring_matrix &sm, const int gap_open, const int gap_extend, const size_t n_thread, const size_t block_size ) {
return pairwise_seq_distance( seq_raw, seq_raw, true, out_scores, sm, gap_open, gap_extend, n_thread, block_size );
}
View
2 papara.h
@@ -703,7 +703,7 @@ class ref_gap_collector {
// TODO: shouldn't it be possible to infer the state_type from oiter?
template<typename iiter, typename oiter, typename state_type>
void transform( iiter istart, iiter iend, oiter ostart, state_type gap ) const {
- assert( std::distance(istart, iend) == ref_gaps_.size() - 1 );
+ assert( std::distance(istart, iend) == ptrdiff_t( ref_gaps_.size() - 1) );
size_t i = 0;
while( istart != iend ) {
View
2 pars_align_gapp_seq.h
@@ -26,7 +26,7 @@ class pars_align_gapp_seq {
typedef unsigned int pars_state_t;
typedef double score_t;
- const static score_t LARGE_VALUE; // = 32000; // score_t must be able to keep LARGE_VALUE + GAP_OPEN + GAP_EXTEND without overflowing!. waiting for c++0x to be able to use std::numeric_limits at compile-time...
+ const static score_t LARGE_VALUE;// = 32000; // score_t must be able to keep LARGE_VALUE + GAP_OPEN + GAP_EXTEND without overflowing!. waiting for c++0x to be able to use std::numeric_limits at compile-time...
const score_t GAP_OPEN; //= 1;
const score_t GAP_EXTEND; // = 1;
const score_t GAP_OPEN_EXTEND;
View
2 phy_to_fasta.cpp
@@ -10,7 +10,7 @@ static bool nogap( char c ) {
return c != '-' && c != 'N';
}
-#if __cplusplus >= 199711L
+#if __cplusplus <= 199711L
namespace std {
// OH NOES, copy_if is c++11... rip it from gnu
template<typename _InputIterator, typename _OutputIterator,
View
592 propara.cpp
@@ -49,7 +49,7 @@
#include "parsimony.h"
#include "pvec.h"
#include "align_utils.h"
-#include "fasta.h"
+#include "ivymike/fasta.h"
#include "raxml_interface.h"
#include "math_approx.h"
@@ -64,6 +64,13 @@
#include "ivymike/algorithm.h"
#include "ivymike/smart_ptr.h"
+#include "papara.h"
+#include "sequence_model.h"
+
+namespace papara {
+log_stream lout;
+}
+
using ivy_mike::tree_parser_ms::lnode;
using ivy_mike::tree_parser_ms::ln_pool;
using ivy_mike::tree_parser_ms::node_data_factory;
@@ -80,7 +87,9 @@ namespace {
typedef boost::iostreams::tee_device<std::ostream, std::ofstream> log_device;
typedef boost::iostreams::stream<log_device> log_stream;
-log_stream lout;
+//log_stream lout;
+
+using papara::lout;
template<typename stream_, typename device_>
class bios_open_guard {
@@ -142,8 +151,8 @@ typedef bios_open_guard<log_stream, log_device> log_stream_guard;
namespace {
-const double g_delta = 0.1;
-const double g_epsilon = 0.8;
+const double g_delta = 0.0001;
+const double g_epsilon = 0.001;
const double g_gap_freq = 0.715;
@@ -162,7 +171,7 @@ class log_odds {
const double bg_prob_;
};
-
+#if 0
class log_odds_aligner_score_only {
@@ -377,7 +386,7 @@ class log_odds_aligner_score_only {
double max_score_;
};
-
+#endif
class odds {
public:
@@ -407,12 +416,12 @@ class log_odds_viterbi {
public:
log_odds_viterbi( const dmat &state, const dsvec &gap, boost::array<double,4> state_freq )
- : ref_state_prob_(state), ref_gap_prob_(gap), ref_len_(state.size1()),
+ : ref_state_prob_(state), ref_gap_prob_(gap), ref_len_(state.size2()),
state_freq_(state_freq),
neg_inf_( -std::numeric_limits<lof_t>::infinity() ),
- m_(state.size1() + 1),
- d_(state.size1() + 1),
- i_(state.size1() + 1),
+ m_(ref_len_ + 1),
+ d_(ref_len_ + 1),
+ i_(ref_len_ + 1),
max_matrix_height_(0),
delta_(g_delta),
epsilon_(g_epsilon)
@@ -421,6 +430,9 @@ class log_odds_viterbi {
}
void setup( size_t qlen ) {
+
+// std::cerr << "len: " << ref_gap_prob_.size() << " " << ref_len_ << "\n";
+
assert( ref_gap_prob_.size() == ref_len_ );
@@ -441,10 +453,10 @@ class log_odds_viterbi {
void precalc_log_odds() {
- ref_state_lo_.resize( ref_state_prob_.size2(), ref_state_prob_.size1() );
+ ref_state_lo_.resize( ref_state_prob_.size1(), ref_state_prob_.size2() );
for( size_t i = 0; i < 4; ++i ) {
- const ublas::matrix_column<dmat> pcol( ref_state_prob_, i );
+ const ublas::matrix_row<dmat> pcol( ref_state_prob_, i );
ublas::matrix_row<lomat> lorow( ref_state_lo_, i );
std::transform( pcol.begin(), pcol.end(), lorow.begin(), log_odds(state_freq_[i]));
}
@@ -481,8 +493,11 @@ class log_odds_viterbi {
assert( m_.size() == ref_len_ + 1 );
-
-
+ const bool verbose = false;
+ if( verbose ) {
+ std::cout << "viterbi:\n";
+ }
+
for( size_t i = 1; i < qlen + 1; ++i ) {
const int b = qs[i-1];
// std::cout << "b: " << b << "\n";
@@ -532,6 +547,9 @@ class log_odds_viterbi {
+ exp(diag_d) * gap_odds
+ exp(diag_i) * gap_odds
);
+
+// std::cout << m_log_sum << " " << m_log_sum2 << "\n";
+
#else
lof_t m_log_sum = diag_m + math_approx::log(
ngap_odds
@@ -542,7 +560,12 @@ class log_odds_viterbi {
diag_m = *m0;
*m0 = m_log_sum + match_log_odds;
+ if( verbose ) {
+ std::cout << std::setw(10) << *m0 << std::setw(10) << diag_d << std::setw(10) << diag_i << ";";
+ // std::cout << std::setw(10) << match_log_odds;
+ }
+ assert( std::isfinite(*m0) );
#if 0
std::cout << i << " " << j << " " << m_(i,j) << " : " << m_(i-1, j-1) + ngap_log_odds
<< " " << d_(i-1, j-1) + gap_log_odds << " " << i_(i-1, j-1) + gap_log_odds << " " << match_log_odds << " " << gap_log_odds << " " << ngap_log_odds << " max: " << m_max << "\n";
@@ -583,9 +606,37 @@ class log_odds_viterbi {
//lof_t old_m = m_[j];
}
+
+ if( verbose ) {
+ std::cout << "\n";
+ }
}
- return m_.back();
+
+ float max_score = 0;
+ {
+ size_t lr_start = std::min( qlen, m_.size() - 1 );
+ //auto max_it = std::max_element( m_.begin() + lr_start, m_.end() );
+
+ for( size_t i = lr_start; i != m_.size(); ++i ) {
+ max_score = max_score + log( 1 + exp( m_[i] - max_score ));
+ }
+
+ if( max_score == 0 ) {
+ std::cout << "meeeep\n";
+
+ std::copy( m_.begin() + lr_start, m_.end(), std::ostream_iterator<float>( std::cout, " " ));
+ std::cout << "\n";
+ getchar();
+
+ }
+
+// assert( max_it != m_.end() );
+// max_score = *max_it;
+
+ }
+ return max_score;
+// return m_.back();
}
dmat ref_state_prob_;
@@ -650,7 +701,7 @@ class log_odds_aligner {
public:
log_odds_aligner( const dmat &state, const dsvec &gap, boost::array<double,4> state_freq, const dsvec &pc_gap_freq )
- : ref_state_prob_(state), ref_gap_prob_(gap), ref_len_(state.size1()),
+ : ref_state_prob_(state), ref_gap_prob_(gap), ref_len_(state.size2()),
state_freq_(state_freq),
pc_gap_freq_( pc_gap_freq ),
neg_inf_( -std::numeric_limits<lof_t>::infinity() ),
@@ -687,15 +738,36 @@ class log_odds_aligner {
}
void precalc_log_odds() {
- ref_state_lo_.resize( ref_state_prob_.size2(), ref_state_prob_.size1() );
+ ref_state_lo_.resize( ref_state_prob_.size1(), ref_state_prob_.size2() );
+ const bool print = false;
+
for( size_t i = 0; i < 4; ++i ) {
- const ublas::matrix_column<dmat> pcol( ref_state_prob_, i );
+ const ublas::matrix_row<dmat> pcol( ref_state_prob_, i );
ublas::matrix_row<lomat> lorow( ref_state_lo_, i );
std::transform( pcol.begin(), pcol.end(), lorow.begin(), log_odds(state_freq_[i]));
- }
+
+ if( print ) {
+ std::cout << "lo row: " << i << "\n";
+ if( i == 0 ) {
+ size_t j = 0;
+ std::for_each( lorow.begin(), lorow.end(), [&](double v) { std::cout << std::setw(10) << j++; });
+ std::cout << "\n";
+
+ }
+
+ std::for_each( lorow.begin(), lorow.end(), [&](double v) { std::cout << std::setw(10) << v; });
+ std::cout << "\n";
+
+// getchar();
- //const double gap_freq = 0.83;
+ }
+ }
+
+ if( print ) {
+ throw std::runtime_error( "exit" );
+ }
+ //const double gap_freq = 0.83;
#if 0
@@ -747,8 +819,14 @@ class log_odds_aligner {
//dmat ref_state_trans = trans(ref_state_prob_);
+ const bool verbose = false;
-
+ if( verbose ) {
+ std::cout << "align\n";
+ }
+
+ //auto m_row = m_.begin1();
+
for( size_t i = 1; i < qlen + 1; ++i ) {
const int b = qs[i-1];
// std::cout << "b: " << b << "\n";
@@ -760,6 +838,19 @@ class log_odds_aligner {
// const ublas::matrix_column<dmat> ngap_prob( ref_gap_prob_, 0 );
// const ublas::matrix_column<dmat> gap_prob( ref_gap_prob_, 1 );
+ auto m_11 = m_.find2(0,i-1,0);
+ auto d_11 = d_.find2(0,i-1,0);
+ auto i_11 = i_.find2(0,i-1,0);
+
+ auto m_00 = m_.find2(0,i,1);
+ auto i_00 = i_.find2(0,i,1);
+
+ auto m_01 = m_.find2(0,i,0);
+ auto d_01 = d_.find2(0,i,0);
+
+// auto d_11 = d_.find2(i-1,0);
+// auto i_11 = i_.find2(i-1,0);
+
for( size_t j = 1; j < ref_len_ + 1; ++j ) {
//ublas::matrix_row<dmat> a_state(ref_state_prob_, j-1 );
//ublas::matrix_row<dmat> a_gap(ref_gap_prob_, j-1 );
@@ -772,6 +863,120 @@ class log_odds_aligner {
match_log_odds = -100;
}
+
+#if 1
+ lof_t gap_log_odds = ref_gap_lo_[j-1];
+ lof_t ngap_log_odds = ref_ngap_lo_[j-1];
+// std::cout << gap_log_odds << " " << ngap_log_odds << "\n";
+// std::cout << "match: " << match_log_odds << "\n";
+#else
+ lof_t gap_log_odds = 0;
+ lof_t ngap_log_odds = 0;
+#endif
+
+ lof_t m_max = max3(
+ *m_11 + ngap_log_odds,
+ *d_11 + gap_log_odds,
+ *i_11 + gap_log_odds
+ );
+
+ *m_00 = m_max + match_log_odds;
+
+#if 0
+ std::cout << i << " " << j << " " << m_(i,j) << " : " << m_(i-1, j-1) + ngap_log_odds
+ << " " << d_(i-1, j-1) + gap_log_odds << " " << i_(i-1, j-1) + gap_log_odds << " " << match_log_odds << " " << gap_log_odds << " " << ngap_log_odds << " max: " << m_max << "\n";
+#endif
+
+ auto m_10 = ++m_11;
+ auto i_10 = ++i_11;
+ if( verbose ) {
+ std::cout << std::setw(10) << m_(i,j);
+// std::cout << std::setw(10) << match_log_odds;
+ }
+ lof_t i_max = std::max(
+ *m_10 + delta_log_,
+ *i_10 + epsilon_log_
+ );
+ *i_00 = i_max;
+
+ lof_t d_max = std::max(
+ *m_01 + delta_log_,
+ *d_01 + epsilon_log_
+ );
+
+ auto d_00 = ++d_01;
+ *d_00 = d_max;
+
+ ++d_11;
+ ++m_00;
+ ++i_00;
+ ++m_01;
+
+
+ }
+ if( verbose ) {
+ std::cout << "\n";
+ }
+ }
+ {
+
+ ublas::matrix_row<lomat> m_last(m_, qlen);
+ ublas::matrix_row<lomat>::iterator max_it;
+
+
+ // start point for searching the maximum in the last row
+ // this is either qlen (=intersection of last row and diagonal) or the last
+ // column (if qlen > reflen).
+ size_t lr_start = std::min( qlen, m_last.size() - 1 );
+
+ max_it = std::max_element( m_last.begin() + lr_start, m_last.end() );
+ max_col_ = std::distance(m_last.begin(), max_it);
+ max_row_ = qlen;
+ max_score_ = *max_it;
+
+
+
+ return max_score_;
+ }
+ }
+
+
+ double align2( const std::vector<uint8_t> &qs ) {
+ const size_t qlen = qs.size();
+
+ setup( qlen );
+
+ //dmat ref_state_trans = trans(ref_state_prob_);
+
+ const bool verbose = false;
+
+ if( verbose ) {
+ std::cout << "align\n";
+ }
+
+ for( size_t i = 1; i < qlen + 1; ++i ) {
+ const int b = qs[i-1];
+ // std::cout << "b: " << b << "\n";
+
+ //const double b_freq = state_freq_.at(b);
+ //const ublas::matrix_column<dmat> b_state( ref_state_prob_, b );
+ const ublas::matrix_row<lomat> b_state_lo( ref_state_lo_, b );
+
+ // const ublas::matrix_column<dmat> ngap_prob( ref_gap_prob_, 0 );
+ // const ublas::matrix_column<dmat> gap_prob( ref_gap_prob_, 1 );
+
+ for( size_t j = 1; j < ref_len_ + 1; ++j ) {
+ //ublas::matrix_row<dmat> a_state(ref_state_prob_, j-1 );
+ //ublas::matrix_row<dmat> a_gap(ref_gap_prob_, j-1 );
+
+ //double match_log_odds = log( b_state[j-1] / b_freq );
+ lof_t match_log_odds = b_state_lo[j-1];
+
+ if( match_log_odds < -100 ) {
+ // std::cout << "odd: " << match_log_odds << b_state[j-1] << " " << b_freq << " " << b << "\n";
+ match_log_odds = -100;
+ }
+
lof_t gap_log_odds = ref_gap_lo_[j-1];
lof_t ngap_log_odds = ref_ngap_lo_[j-1];
lof_t m_max = max3(
@@ -786,6 +991,12 @@ class log_odds_aligner {
std::cout << i << " " << j << " " << m_(i,j) << " : " << m_(i-1, j-1) + ngap_log_odds
<< " " << d_(i-1, j-1) + gap_log_odds << " " << i_(i-1, j-1) + gap_log_odds << " " << match_log_odds << " " << gap_log_odds << " " << ngap_log_odds << " max: " << m_max << "\n";
#endif
+
+
+ if( verbose ) {
+ std::cout << std::setw(10) << m_(i,j);
+// std::cout << std::setw(10) << match_log_odds;
+ }
lof_t i_max = std::max(
m_(i-1,j) + delta_log_,
i_(i-1,j) + epsilon_log_
@@ -800,12 +1011,22 @@ class log_odds_aligner {
d_(i,j) = d_max;
}
+ if( verbose ) {
+ std::cout << "\n";
+ }
}
{
+
ublas::matrix_row<lomat> m_last(m_, qlen);
ublas::matrix_row<lomat>::iterator max_it;
-
- max_it = std::max_element( m_last.begin() + qlen, m_last.end() );
+
+
+ // start point for searching the maximum in the last row
+ // this is either qlen (=intersection of last row and diagonal) or the last
+ // column (if qlen > reflen).
+ size_t lr_start = std::min( qlen, m_last.size() - 1 );
+
+ max_it = std::max_element( m_last.begin() + lr_start, m_last.end() );
max_col_ = std::distance(m_last.begin(), max_it);
max_row_ = qlen;
max_score_ = *max_it;
@@ -815,8 +1036,7 @@ class log_odds_aligner {
return max_score_;
}
}
-
-
+
void traceback( std::vector<uint8_t> *tb ) {
assert( tb != 0 );
@@ -1014,18 +1234,21 @@ class my_fact : public ivy_mike::tree_parser_ms::node_data_factory {
class queries {
public:
+ typedef sequence_model::model<sequence_model::tag_dna4> seq_model;
+
void load_fasta( const char *name ) {
std::ifstream is( name );
assert( is.good() );
assert( names_.size() == raw_seqs_.size() );
- read_fasta( is, names_, raw_seqs_ );
+ ivy_mike::read_fasta( is, names_, raw_seqs_ );
}
- // WARNING: unsafe move semantics on parameter seq!
- void add( const std::string &name, std::vector<uint8_t> &seq ) {
+
+ void add_move( const std::string &name, std::vector<uint8_t> &&seq ) {
names_.push_back(name);
- ivy_mike::push_back_swap(raw_seqs_, seq );
+ //ivy_mike::push_back_swap(raw_seqs_, seq );
+ raw_seqs_.emplace_back( seq );
}
@@ -1036,13 +1259,26 @@ class queries {
for( size_t i = 0; i < raw_seqs_.size(); ++i ) {
seqs_.at(i).clear();
- // recode non-gap characters in raw_seq into seq
- // once again i'm starting doubt that it's a good idea to do _everyting_
- // with stl algorithms *g*.
-
- std::transform( raw_seqs_[i].begin(), raw_seqs_[i].end(),
- back_insert_ifer(seqs_[i], std::bind2nd(std::less_equal<uint8_t>(), 3 ) ),
- encode_dna );
+ // recode (ascii character to canonical state space) non-gap characters in raw_seq into seq
+
+
+ std::for_each( raw_seqs_[i].begin(), raw_seqs_[i].end(),
+ [&]( uint8_t rc ) {
+ uint8_t cc = seq_model::s2c(rc);
+ if( !seq_model::cstate_is_gap(cc) ) {
+ seqs_[i].push_back(cc);
+ }
+
+ });
+
+// std::cout << "qs: " << i << " " << names_.at(i) << "\n";
+
+// std::transform( m_qs_seqs[i].begin(), m_qs_seqs[i].end(),
+// back_insert_ifer(m_qs_cseqs[i], std::not1( std::ptr_fun(seq_model::cstate_is_gap) )),
+// seq_model::s2c );
+// std::transform( raw_seqs_[i].begin(), raw_seqs_[i].end(),
+// back_insert_ifer(seqs_[i], std::bind2nd(std::less_equal<uint8_t>(), 3 ) ),
+// encode_dna );
}
}
@@ -1069,6 +1305,7 @@ class queries {
return m;
}
+ #if 0
static void seq_to_position_map( const std::vector<uint8_t> &seq, std::vector<int> *map ) {
for( size_t i = 0; i < seq.size(); ++i ) {
if( is_dna(seq[i]) ) {
@@ -1122,6 +1359,7 @@ class queries {
}
return std::numeric_limits<uint8_t>::max();
}
+#endif
private:
std::vector<std::string> names_;
std::vector<std::vector<uint8_t> > raw_seqs_; // seqs in the source alphabet (e.g. ACGT)
@@ -1134,6 +1372,8 @@ class queries {
class references {
public:
+ typedef sequence_model::model<sequence_model::tag_dna4> seq_model;
+
references( sptr::shared_ptr<ln_pool> pool, const std::string &tree_name, const std::string &ali_name )
: pool_(pool), tree_name_(tree_name), ali_name_(ali_name)
{}
@@ -1174,7 +1414,7 @@ class references {
ivy_mike::push_back_swap( ref_names_, ref_ma.names[i]);
ivy_mike::push_back_swap( ref_seqs_, ref_ma.data[i]);
} else {
- qs.add( ref_ma.names[i], ref_ma.data[i] );
+ qs.add_move( ref_ma.names[i], std::move(ref_ma.data[i]) );
}
}
}
@@ -1295,6 +1535,10 @@ class references {
const std::vector<std::vector<uint8_t> > &ref_seqs() const {
return ref_seqs_;
}
+
+ size_t ref_len() const {
+ return ref_seqs_.front().size();
+ }
const std::vector<double> &per_column_gap_freq() const {
return per_column_gap_freq_;
@@ -1362,7 +1606,7 @@ class references {
assert( it->size() == gap_counts.size() );
for( size_t i = 0; i < gap_counts.size(); ++i ) {
- if( !queries::is_dna((*it)[i]) ) {
+ if( !seq_model::sstate_is_character((*it)[i]) ) {
gap_counts[i]++;
}
}
@@ -1377,7 +1621,25 @@ class references {
// std::cout << "\n";
}
-
+// static bool is_dna( uint8_t c ) {
+// switch( c ) {
+// case 'a':
+// case 'c':
+// case 'g':
+// case 't':
+// case 'A':
+// case 'C':
+// case 'G':
+// case 'T':
+// case 'U':
+// case 'u':
+// return true;
+// default:
+// {}
+// }
+// return false;
+// }
+
sptr::shared_ptr<ln_pool> pool_;
const std::string tree_name_;
@@ -1420,9 +1682,8 @@ struct scoring_results {
scoring_results( size_t num_qs )
: best_score_(num_qs, -std::numeric_limits<double>::infinity()),
- best_ref_(num_qs, size_t(-1)),
- best_tb_(num_qs),
- os_("scores.txt")
+ best_ref_(num_qs, size_t(-1))
+// os_("scores.txt")
{}
@@ -1435,12 +1696,13 @@ struct scoring_results {
// if offer return true, this means that the qs/ref pair is currently best-scoring
// the thread can crate the traceback and set it later with 'set_trace'
bool offer( size_t qs, size_t ref, double score_ali, double score_vit ) {
- boost::lock_guard<boost::mutex> lock(mtx_);
-
- os_ << qs << " " << ref << " " << score_ali << " " << score_vit << "\n";
-
- const double score = score_vit;
+// boost::lock_guard<boost::mutex> lock(mtx_);
+ if( qs == 0 ) {
+ os_ << qs << " " << ref << " " << score_ali << " " << score_vit << "\n";
+ }
+// const double score = score_vit;
+ const double score = score_ali;
if( best_score_.at(qs) < score || (delta_equal(best_score_.at(qs), score) && ref < best_ref_.at(qs))) {
best_score_[qs] = score;
best_ref_.at(qs) = ref;
@@ -1450,20 +1712,45 @@ struct scoring_results {
return false;
}
- // offer a traceback for a qs/ref pair. It is declined if it is no longer the best scoring pair
- void set_trace( size_t qs, size_t ref, std::vector<uint8_t> *trace ) {
- boost::lock_guard<boost::mutex> lock(mtx_);
+// // offer a traceback for a qs/ref pair. It is declined if it is no longer the best scoring pair
+// void set_trace( size_t qs, size_t ref, std::vector<uint8_t> *trace ) {
+// boost::lock_guard<boost::mutex> lock(mtx_);
+//
+// if( best_ref_.at(qs) == ref ) {
+// best_tb_.at(qs).swap(*trace);
+// } else {
+// std::cerr << "declined stale traceback\n";
+// }
+// }
- if( best_ref_.at(qs) == ref ) {
- best_tb_.at(qs).swap(*trace);
- } else {
- std::cerr << "declined stale traceback\n";
+ void merge( const scoring_results &other ) {
+ boost::lock_guard<boost::mutex> lock(mtx_);
+
+ const size_t size = best_score_.size();
+ assert( size == best_ref_.size() );
+
+ assert( other.best_score_.size() == other.best_ref_.size() ); // make sure that other is sane itself.
+
+ if( size != other.best_score_.size() ) {
+ throw std::runtime_error( "inconsistent sizes in scoring_result::merge" );
+
+ }
+
+ for( size_t i = 0; i < best_score_.size(); ++i ) {
+ float other_score = other.best_score_[i];
+ size_t other_ref = other.best_ref_[i];
+
+ if( best_score_[i] < other_score || (delta_equal(best_score_[i], other_score) && other_ref < best_ref_[i])) {
+ best_score_[i] = other_score;
+ best_ref_[i] = other_ref;
+ }
}
+
}
-
+
std::vector<double> best_score_;
std::vector<size_t> best_ref_;
- std::vector<std::vector<uint8_t> > best_tb_;
+// std::vector<std::vector<uint8_t> > best_tb_;
std::ofstream os_;
@@ -1472,6 +1759,9 @@ struct scoring_results {
};
+
+
+
class scoring_worker {
public:
@@ -1486,20 +1776,23 @@ class scoring_worker {
void operator()() {
std::vector<uint8_t> tb_tmp;
-
+ std::vector<uint8_t> rtb_tmp;
+
uint64_t cups = 0;
ivy_mike::timer t1;
+ scoring_results local_res( res_->best_score_.size() );
+
for( size_t i = rank_; i < refs_.node_size(); i+=num_workers_ ) {
const lnode *a = refs_.get_node(i);
assert( a->towards_root );
assert( a->m_data != 0 );
const my_adata *ma = dynamic_cast<const my_adata *>(a->m_data.get());
- log_odds_viterbi vit( ma->state_probs(), ma->gap_probs(), refs_.base_freqs() );
+// log_odds_viterbi vit( ma->state_probs(), ma->gap_probs(), refs_.base_freqs() );
log_odds_aligner ali_score( ma->state_probs(), ma->gap_probs(), refs_.base_freqs(), refs_.per_column_gap_freq() );
//log_odds_aligner_score_only ali_score( ma->state_probs(), ma->gap_probs(), refs_.base_freqs() );
@@ -1513,17 +1806,32 @@ class scoring_worker {
const std::vector<uint8_t> &b = qs_.get_recoded(j);
- cups += ma->state_probs().size1() * b.size();
+ cups += ma->state_probs().size2() * b.size();
//double score2 = ali.align(b);
double score = ali_score.align(b);
- double score_vit = vit.align(b);
-
- //std::cout << "score: " << score << " " << score2 << "\n";
+// double score_vit = vit.align(b);
+ double score_vit = 0;
+
+// std::cout << "score: " << score << " " << score_vit << "\n";
+
+ if ( false ) {
+ tb_tmp.clear();
+ ali_score.traceback(&tb_tmp);
+ rtb_tmp.clear();
+ align_utils::realize_trace( b, tb_tmp, &rtb_tmp );
+
+// os << std::setw(max_name_len+1) << std::left << qs.get_name(j);
+ std::copy( rtb_tmp.begin(), rtb_tmp.end(), std::ostream_iterator<char>(std::cout));
+ std::cout << "\n";
+ }
//assert( score == score2 );
- //std::cout << "score: " << j << " " << i << " " << score << "\n";
- if( res_->offer(j, i, score, score_vit ) ) {
+// std::cout << "score: " << j << " " << i << " " << score << "\n";
+
+
+
+ if( local_res.offer(j, i, score, score_vit ) ) {
//tb_tmp.clear();
// ali.traceback(&tb_tmp);
@@ -1533,13 +1841,17 @@ class scoring_worker {
// if( j == 0 ) {
// std::cout << "score: " << score << "\n";
// }
+
+// getchar();
}
//std::cout << "score: " << score << "\n";
}
+
+ res_->merge(local_res);
std::cout << "time: " << t1.elapsed() << " " << cups / (t1.elapsed()*1e9) << " gncup/s\n";
}
@@ -1616,7 +1928,7 @@ int main( int argc, char *argv[] ) {
log_device ldev(std::cout, logs);
log_stream_guard lout_guard(lout, ldev);
lout << "bla\n";
- sptr::shared_ptr<ln_pool> pool(new ln_pool(std::auto_ptr<node_data_factory>(new my_fact)));
+ sptr::shared_ptr<ln_pool> pool(new ln_pool(ln_pool::fact_ptr_type(new my_fact)));
queries qs;
if(qs_name != 0){
qs.load_fasta(qs_name);
@@ -1653,22 +1965,26 @@ int main( int argc, char *argv[] ) {
std::cout << "time: " << t1.elapsed() << " " << cups / (t1.elapsed()*1e9) << " gncup/s\n";
- std::vector<uint8_t> tb;
- std::vector<uint8_t> rtb;
- std::string out_name(filename( opt_run_name, "aligned_qs" ));
- std::ofstream os( out_name.c_str());
+// std::vector<uint8_t> tb;
+
+ std::string out_name(filename( opt_run_name, "alignment" ));
+// std::ofstream os( out_name.c_str());
std::string qual_name(filename( opt_run_name, "quality" ));
std::ofstream os_qual( qual_name.c_str());
- const size_t max_name_len = write_ref_seqs( os, refs, qs.size(), qs.max_name_length() );
+// const size_t max_name_len = write_ref_seqs( os, refs, qs.size(), qs.max_name_length() );
double qual = 0.0;
int num_qual = 0;
-
+
+ size_t max_name_len = 0;
+
+ std::vector<std::vector<uint8_t> > qs_traces(qs.size());
+ std::vector<double > qs_scores(qs.size());
for( size_t i = 0; i < refs.node_size(); ++i )
{
@@ -1684,62 +2000,146 @@ int main( int argc, char *argv[] ) {
assert( res.best_ref_.size() == qs.size() );
log_odds_aligner ali( ma->state_probs(), ma->gap_probs(), refs.base_freqs(), refs.per_column_gap_freq() );
+
+ max_name_len = std::max( max_name_len, refs.ref_names().at(i).size() );
+
for( size_t j = 0; j < qs.size(); ++j ) {
+ if( i == 0 ) {
+ // on the first round through the refs also search for the maximum qs name length
+ max_name_len = std::max( max_name_len, qs.get_name(j).size() );
+ }
+
if( res.best_ref_[j] != i ) {
continue;
}
const std::vector<uint8_t> &b = qs.get_recoded(j);
double score = ali.align(b);
-
+ qs_scores.at(j) = score;
//assert( score == res.best_score_.at(j) );
- tb.clear();
- ali.traceback(&tb);
+// tb.clear();
+// ali.traceback(&tb);
+ ali.traceback( &qs_traces.at(j) );
+ }
+ }
- rtb.clear();
- align_utils::realize_trace( b, tb, &rtb );
- os << std::setw(max_name_len+1) << std::left << qs.get_name(j);
- std::copy( rtb.begin(), rtb.end(), std::ostream_iterator<char>(os));
- os << "\n";
+ papara::output_alignment_phylip oa( out_name.c_str() );
+ typedef sequence_model::model<sequence_model::tag_dna4> seq_model;
+
+ // collect ref gaps introduiced by qs
+ const size_t pad = max_name_len + 1;
+ const bool ref_gaps = true;
+
+ papara::ref_gap_collector rgc( refs.ref_len() );
+ for( std::vector<std::vector<uint8_t> >::iterator it = qs_traces.begin(); it != qs_traces.end(); ++it ) {
+ rgc.add_trace(*it);
+ }
+
+
+
+ const size_t num_refs = refs.ref_seqs().size();
+
+ if( ref_gaps ) {
+ oa.set_size(num_refs + qs.size(), rgc.transformed_ref_len());
+ } else {
+ oa.set_size(num_refs + qs.size(), refs.ref_len() );
+ }
+ oa.set_max_name_length( pad );
+
+ // write refs (and apply the ref gaps)
+
+ std::vector<char> tmp;
+ for( size_t i = 0; i < refs.ref_seqs().size(); i++ ) {
+ tmp.clear();
+
+
+
+ if( ref_gaps ) {
+ rgc.transform( refs.ref_seqs().at(i).begin(), refs.ref_seqs().at(i).end(), std::back_inserter(tmp), '-' );
+ } else {
+ std::transform( refs.ref_seqs().at(i).begin(), refs.ref_seqs().at(i).end(), std::back_inserter(tmp), seq_model::normalize);
+ }
+
+ oa.push_back( refs.ref_names().at(i), tmp, papara::output_alignment::type_ref );
+ //std::transform( m_ref_seqs[i].begin(), m_ref_seqs[i].end(), std::ostream_iterator<char>(os), seq_model::normalize );
+
+ }
+
+
+
+ std::vector<uint8_t>out_qs_cstate;
+ std::vector<char>out_qs_char;
+ for( size_t i = 0; i < qs.size(); i++ ) {
+ tmp.clear();
+ const std::vector<uint8_t> &qp = qs.get_recoded(i);
+
+ out_qs_cstate.clear();
+
+ // realize+write the tracebacks. this is done in cstate (=canonical state) space of the dna4 model (=the
+ // sequences don't contain gaps/ambiguous states)
+
+
+ if( ref_gaps ) {
+ papara::gapstream_to_alignment(qs_traces.at(i), qp, &out_qs_cstate, seq_model::gap_cstate(), rgc);
+ } else {
+ papara::gapstream_to_alignment_no_ref_gaps(qs_traces.at(i), qp, &out_qs_cstate, seq_model::gap_cstate() );
+ }
+
+ //os << std::setw(pad) << std::left << qs.name_at(i);
+// std::transform( out_qs_ps.begin(), out_qs_ps.end(), std::back_inserter(tmp), seq_model::p2s );
+
+
+ // FIXME: this looks like a design quirk: why is papara::output_alignment based on char sequences while
+ // everywhere else uint8_t seqeunces?
+ //out_qs_char.assign( out_qs.begin(), out_qs.end() );
+
+ // transform from canonical state space into (ascii) sequence space
+
+ out_qs_char.clear();
+ std::transform( out_qs_cstate.begin(), out_qs_cstate.end(), std::back_inserter(out_qs_char), seq_model::c2s );
+
+ oa.push_back( qs.get_name(i), out_qs_char, papara::output_alignment::type_qs );
- {
- std::vector<int> map_ref;
- std::vector<int> map_aligned;
- queries::seq_to_position_map( qs.get_raw(j), &map_ref );
- align_utils::trace_to_position_map( tb, &map_aligned );
- if( map_ref.size() != map_aligned.size() ) {
- throw std::runtime_error( "alignment quirk: map_ref.size() != map_aligned.size()" );
- }
- size_t num_equal = ivy_mike::count_equal( map_ref.begin(), map_ref.end(), map_aligned.begin() );
+#if 0
+ if( os_quality.good() && qs.seq_at(i).size() == refs.pvec_size()) {
- //std::cout << "size: " << map_ref.size() << " " << map_aligned.size() << " " << m_qs_seqs[i].size() << "\n";
- //std::cout << num_equal << " equal of " << map_ref.size() << "\n";
- double qscore = num_equal / double(map_ref.size());
- //std::cout << "score: " << qscore << "\n";
- qual += qscore;
- ++num_qual;
+ std::vector<int> map_ref;
+ std::vector<int> map_aligned;
+ seq_to_position_map( qs.seq_at(i), map_ref );
+ align_utils::trace_to_position_map( qs_traces[i], &map_aligned );
- os_qual << qs.get_name(j) << " " << qscore << " " << score << "\n";
+ if( map_ref.size() != map_aligned.size() ) {
+ throw std::runtime_error( "alignment quirk: map_ref.size() != map_aligned.size()" );
}
+ size_t num_equal = ivy_mike::count_equal( map_ref.begin(), map_ref.end(), map_aligned.begin() );
+
+ //std::cout << "size: " << map_ref.size() << " " << map_aligned.size() << " " << m_qs_seqs[i].size() << "\n";
+ //std::cout << num_equal << " equal of " << map_ref.size() << "\n";
+ double score = num_equal / double(map_ref.size());
+ //double score = alignment_quality( out_qs, m_qs_seqs[i], debug );
+ os_quality << qs.name_at(i) << " " << score << "\n";
+ mean_quality += score;
+ n_quality += 1;
}
- }
+#endif
+ }
std::cout << t.elapsed() << std::endl;
lout << "SUCCESS " << t.elapsed() << std::endl;
View
9 pvec.h
@@ -267,6 +267,10 @@ class probgap_model {
reset( seqs );
}
+ probgap_model( double gap_freq ) : m_valid( false ) {
+ reset( gap_freq );
+ }
+
void reset( const std::vector< std::vector<uint8_t> > &seqs ) {
// initialize probgap model from input sequences
reset( calc_gap_freq( seqs ) );
@@ -342,13 +346,14 @@ class pvec_pgap {
public:
+ // WARNING WARNING WARNING: this is the stupid_pointer, used to inject a global probgap_model into class pvec_pgap
static ivy_mike::stupid_ptr<probgap_model> pgap_model;
- inline const std::vector<parsimony_state> &get_v() {
+ inline const std::vector<parsimony_state> &get_v() const {
return v;
}
- inline const boost::numeric::ublas::matrix<double> &get_gap_prob() {
+ inline const boost::numeric::ublas::matrix<double> &get_gap_prob() const {
return gap_prob;
}
View
82 raxml_interface.cpp
@@ -350,7 +350,7 @@ void optimize_branch_lengths( ivy_mike::tree_parser_ms::lnode *tree, const std::
perf_timer.print();
}
-
+#if 0
lnode *optimize_branch_lengths2( ivy_mike::tree_parser_ms::lnode *tree, const std::map<std::string, const std::vector<uint8_t> * const> &name_to_seq, ln_pool &pool ) {
ivy_mike::perf_timer perf_timer(!true);
@@ -374,7 +374,7 @@ lnode *optimize_branch_lengths2( ivy_mike::tree_parser_ms::lnode *tree, const st
std::map<std::string, sptr::shared_ptr<lnode> > tip_map;
for( std::vector<lnode *>::iterator it = tc.m_nodes.begin(); it != tc.m_nodes.end(); ++it ) {
- tip_map.insert( std::make_pair( (*it)->m_data->tipName, (*it)->get_smart_ptr() ) );
+ tip_map.insert( std::make_pair( (*it)->m_data->tipName, (*it)->get_smart_ptr().lock() ) );
}
@@ -501,7 +501,7 @@ lnode *optimize_branch_lengths2( ivy_mike::tree_parser_ms::lnode *tree, const st
assert( ret_node->back->back != 0 );
return ret_node;
}
-
+#endif
//size_t file_size( const char *name ) {
// std::ifstream is(name);
//
@@ -550,7 +550,7 @@ std::string digest_files( const std::vector<std::string> &files ) {
namespace ublas = boost::numeric::ublas;
void launch_or_not( const std::string &raxml, Poco::Process::Args args, const std::string &digest, std::vector<std::string> *out_files ) {
- std::string temp_name = "/tmp/";
+ std::string temp_name = "/space/tmp/";
temp_name += "propara_";
temp_name += digest;
@@ -630,6 +630,45 @@ void launch_or_not( const std::string &raxml, Poco::Process::Args args, const st
}
+std::vector<ublas::matrix<double> > read_binary_anc_probs( std::istream &pis ) {
+ std::vector<ublas::matrix<double> > pvecs;
+
+
+
+ while( !pis.eof() ) {
+ int32_t counter;
+ pis.read((char*)&counter, 4 );
+
+
+
+ if( counter == -1 ) {
+ break;
+ }
+
+ assert( size_t(counter) == pvecs.size() );
+
+ int32_t width;
+ pis.read((char*)&width, 4 );
+
+// std::cout << "width: " << width << "\n";
+ assert( width > 0 );
+
+
+
+ //pvecs->push_back(ublas::matrix<double>(width, 4));
+ //ublas::matrix<double> &mat = pvecs->back();
+ ublas::matrix<double> mat( width, 4 );
+
+ // the underlying unbounded_array seems to be guaranteed to have a sensible memory layout. read straight into it.
+ // TODO: get fancy and directly back the boost::matrix with the mmaped binary file ;-)
+ pis.read((char*)mat.data().begin(), width * 4 * 8 );
+
+ pvecs.emplace_back( ublas::trans(mat) );
+ }
+
+ return pvecs;
+}
+
lnode *generate_marginal_ancestral_state_pvecs( ln_pool &pool, const std::string &tree_name, const std::string &ali_name, std::vector<ublas::matrix<double> > *pvecs ) {
ivy_mike::perf_timer perf_timer(!true);
@@ -696,42 +735,12 @@ lnode *generate_marginal_ancestral_state_pvecs( ln_pool &pool, const std::string
// this is a huge improvement over the text file stuff (below) and now reads at 1200Mb/s
// from warm disk-cache instead of 20 Mb/s...
- pvecs->clear();
+ //pvecs->clear();
+ *pvecs = read_binary_anc_probs( pis );
ivy_mike::timer t1;
- std::vector<double> tmp;
- std::string line;
- std::vector<double> token;
- std::stringstream strstr;
-
- while( !pis.eof() ) {
- int32_t counter;
- pis.read((char*)&counter, 4 );
-
- if( counter == -1 ) {
- break;
- }
-
- assert( size_t(counter) == pvecs->size() );
-
- int32_t width;
- pis.read((char*)&width, 4 );
-
-// std::cout << "width: " << width << "\n";
- assert( width > 0 );
-
-
-
- pvecs->push_back(ublas::matrix<double>(width, 4));
- ublas::matrix<double> &mat = pvecs->back();
-
- // the underlying unbounded_array seems to be guaranteed to have a sensible memory layout. read straight into it.
- // TODO: get fancy and directly back the boost::matrix with the mmaped binary file ;-)
- pis.read((char*)mat.data().begin(), width * 4 * 8 );
- }
-
size_t s = -1;
for( size_t i = 0; i < pvecs->size(); ++i ) {
if( s == size_t(-1) ) {
@@ -903,3 +912,4 @@ lnode *generate_marginal_ancestral_state_pvecs( ln_pool &pool, const std::string
}
#endif
+
View
4 raxml_interface.h
@@ -29,11 +29,15 @@
#include <stdint.h>
#include <boost/array.hpp>
#include <boost/numeric/ublas/fwd.hpp>
+
+#include <iosfwd>
+
void optimize_branch_lengths( ivy_mike::tree_parser_ms::lnode *tree, const std::map<std::string, const std::vector<uint8_t> * const> &name_to_seq );
ivy_mike::tree_parser_ms::lnode *optimize_branch_lengths2( ivy_mike::tree_parser_ms::lnode *tree, const std::map<std::string, const std::vector<uint8_t> * const> &name_to_seq, ivy_mike::tree_parser_ms::ln_pool &pool );
+std::vector<boost::numeric::ublas::matrix<double> > read_binary_anc_probs( std::istream &pis );
//ivy_mike::tree_parser_ms::lnode *generate_marginal_ancestral_state_pvecs( ivy_mike::tree_parser_ms::ln_pool &pool, const std::string &tree_name, const std::string &ali_name, std::vector<boost::array<std::vector<double>, 4> > *pvecs );
ivy_mike::tree_parser_ms::lnode *generate_marginal_ancestral_state_pvecs( ivy_mike::tree_parser_ms::ln_pool &pool, const std::string &tree_name, const std::string &ali_name, std::vector<boost::numeric::ublas::matrix<double> > *pvecs );
View
12 sequence_model.cpp
@@ -24,6 +24,7 @@
using sequence_model::model;
using sequence_model::tag_aa;
using sequence_model::tag_dna;
+using sequence_model::tag_dna4;
namespace raxml_aa_meaning {
// is it officially legal to initialize const static members in the header? I guess c++ removes redundant definitions during linking...
@@ -46,5 +47,16 @@ const char inverse[16] = {'_', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y
+
+
const std::vector<char> model<tag_dna>::inverse_meaning(raxml_dna_meaning::inverse, raxml_dna_meaning::inverse + ivy_mike::arrlen(raxml_dna_meaning::inverse));
//const std::vector<uint8_t> model<tag_dna>::bit_vector(raxml_dna_meaning::bitvector, raxml_dna_meaning::bitvector + ivy_mike::arrlen(raxml_dna_meaning::bitvector));
+
+
+namespace raxml_dna4_meaning {
+const char inverse[5] = {'A', 'C', 'G', 'T', '-' };
+//const char bitvector[17] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 };
+}
+
+
+const std::vector<char> model<tag_dna4>::inverse_meaning(raxml_dna4_meaning::inverse, raxml_dna4_meaning::inverse + ivy_mike::arrlen(raxml_dna4_meaning::inverse));
View
79 sequence_model.h
@@ -57,7 +57,9 @@ inline size_t popcount( unsigned int v ) {
namespace sequence_model {
+
class tag_dna;
+class tag_dna4;
class tag_aa;
@@ -163,6 +165,83 @@ class model<tag_dna> {
};
+
+template<>
+class model<tag_dna4> {
+public:
+
+ typedef uint8_t pars_state_t;
+
+ const static std::vector<char> inverse_meaning;
+// const static std::vector<pars_state_t> bit_vector;
+
+ static uint8_t normalize( size_t xc ) {
+ assert( xc <= 255 );
+
+ int c = int(xc);
+ c = std::toupper(c);
+
+ switch( c ) {
+ case 'U':
+ return 'T';
+ case 'N':
+ case '?':
+ return '-';
+ default:
+ return c;
+ }
+
+ }
+
+
+ static bool sstate_is_character( uint8_t c ) {
+ c = normalize(c);
+ ptrdiff_t idx = std::distance(inverse_meaning.begin(),
+ std::find(inverse_meaning.begin(), inverse_meaning.end(), c ) );
+
+ return idx >= 0 && idx <= gap_cstate();
+
+
+ }
+
+ static uint8_t s2c( size_t c ) {
+ c = normalize(c);
+ ptrdiff_t idx = std::distance(inverse_meaning.begin(),
+ std::find(inverse_meaning.begin(), inverse_meaning.end(), c ) );
+
+ assert( idx >= 0 );
+
+ if( size_t(idx) >= inverse_meaning.size() ) {
+ std::cerr << "illegal character: " << int(c) << "\n";
+ throw std::runtime_error( "illegal character in DNA/RNA sequence");
+ }
+
+ return uint8_t(idx); // safe because of the check above
+ }
+
+ static uint8_t c2s( size_t c ) {
+ return inverse_meaning.at(c);
+ }
+
+
+
+ static inline uint8_t gap_cstate() {
+ return inverse_meaning.size() - 1;
+ }
+
+ static inline bool cstate_is_gap( uint8_t cs) {
+ return cs == gap_cstate();
+ }
+
+
+
+ static inline size_t num_cstates() {
+ return inverse_meaning.size();
+ }
+
+};
+
+
// FIXME: fake aa model
template<>
class model<tag_aa> {
View
4 stepwise_addition.cpp
@@ -17,6 +17,8 @@
* along with papara. If not, see <http://www.gnu.org/licenses/>.
*/
+// #include "pch.h"
+
#include "stepwise_align.h"
#include <functional>
#include <iomanip>
@@ -2341,7 +2343,7 @@ int main( int argc, char **argv ) {
std::map<std::string, std::vector<uint8_t> >out_msa1;
- sptr::shared_ptr<ln_pool> pool(new ln_pool(std::auto_ptr<node_data_factory>(new my_fact()) ));
+ sptr::shared_ptr<ln_pool> pool(new ln_pool(ln_pool::fact_ptr_type(new my_fact()) ));
lnode *last_tree;
{
View
1,853 stepwise_addition_pro.cpp
@@ -17,13 +17,18 @@
* along with papara. If not, see <http://www.gnu.org/licenses/>.
*/
+// #include "pch.h"
+
#include <cctype>
#include <algorithm>
#include <functional>
#include <vector>
#include <iostream>
#include <fstream>
+#include <iterator>
+
+#define BOOST_UBLAS_NDEBUG
#include <boost/thread.hpp>
#include <boost/bind.hpp>
@@ -33,29 +38,749 @@
#include "pairwise_seq_distance.h"
#include "stepwise_align.h"
#include "raxml_interface.h"
+#include "sequence_model.h"
+#include "pvec.h"
#include "ivymike/time.h"
#include "ivymike/getopt.h"
#include "ivymike/smart_ptr.h"
#include "ivymike/tree_parser.h"
#include "ivymike/tdmatrix.h"
#include "ivymike/algorithm.h"
#include "ivymike/tree_traversal_utils.h"
-
+#include "ivymike/tree_split_utils.h"
+#include "ivymike/flat_map.h"
+#include "ivymike/fasta.h"
namespace tree_parser = ivy_mike::tree_parser_ms;
using ivy_mike::tree_parser_ms::ln_pool;
using ivy_mike::tree_parser_ms::lnode;
+using sequence_model::tag_aa;
+using sequence_model::tag_dna;
+
+using ivy_mike::apply_lnode;
using ivy_mike::apply_lnode;
using ivy_mike::back_insert_ifer;
using ivy_mike::iterate_lnode;
using ivy_mike::rooted_bifurcation;
+using ivy_mike::back_insert_ifer;
+using ivy_mike::iterate_lnode;
+using ivy_mike::rooted_bifurcation;
+using ivy_mike::scoring_matrix;
+
namespace ublas = boost::numeric::ublas;
typedef std::vector<unsigned char> sequence;
+namespace {
+
+// double g_delta = log(0.1);
+// double g_epsilon = log(0.5);
+}
+
+class log_odds {
+public:
+
+ log_odds( double bg_prob ) : bg_prob_(bg_prob) {}
+
+ inline double operator()( double p ) {
+ return std::max( -100.0, log( p / bg_prob_ ));
+ }
+
+private:
+ const double bg_prob_;
+};
+
+
+static std::vector<uint8_t> gapstream_to_alignment( const std::vector<uint8_t> &gaps, const std::vector<uint8_t> &raw, uint8_t gap_char, bool upper ) {
+
+ std::vector<uint8_t> out;
+
+ std::vector<uint8_t>::const_reverse_iterator rit = raw.rbegin();
+
+
+ // 'gap indicator': if upper is set, insert gap into reference (=if gaps[i] == 2).
+ const uint8_t gap_ind = upper ? 2 : 1;
+
+
+
+ for ( std::vector<uint8_t>::const_iterator git = gaps.begin(); git != gaps.end(); ++git ) {
+
+
+
+ if ( *git == gap_ind ) {
+ out.push_back(gap_char);
+ } else {
+
+ if( rit != raw.rend() ) {
+ out.push_back(*rit);
+ ++rit;
+ } else {
+ out.push_back( 'X' );
+ }
+ }
+ }
+ if( rit != raw.rend() ) {
+
+ std::cerr << "too short tb: " << raw.rend() - rit << " upper: " << upper << "\n";
+ }
+ std::reverse(out.begin(), out.end());
+
+ return out;
+}
+
+class log_odds_viterbi {
+ typedef ublas::matrix<double> dmat;
+ typedef std::vector<double> dsvec;
+
+
+ // lof_t: log-odds-float = float type good enough to hold/calculate log-odds scores.
+ // 32bit float should be enough
+ typedef float lof_t;
+ typedef ublas::matrix<lof_t> lomat;
+
+ typedef std::vector<lof_t> losvec;
+
+public:
+ log_odds_viterbi( const dmat &state, const dmat &gap, boost::array<double,4> state_freq )
+ :
+ ref_state_prob_(state), ref_gap_prob_(gap), ref_len_(state.size2()),
+ state_freq_(state_freq),
+ neg_inf_( -std::numeric_limits<lof_t>::infinity() ),
+ m_(state.size2() + 1),
+ d_(state.size2() + 1),
+ i_(state.size2() + 1),
+ max_matrix_height_(0),
+ delta_(log(0.01)),
+ epsilon_(log(0.1))
+ {
+ ref_gap_prob_log_ = ref_gap_prob_;
+ {
+ auto &d = ref_gap_prob_log_.data();
+ std::transform( d.begin(), d.end(), d.begin(), log );
+
+
+ }
+
+// for( auto it1 = ref_gap_prob_.begin1(); it1 != ref_gap_prob_.end1(); ++it1 ) {
+// std::copy( it1.begin(), it1.end(), std::ostream_iterator<float>( std::cout, " " ));
+// std::cout << "\n";
+//
+// }
+
+ precalc_log_odds();
+ }
+
+ void setup( size_t qlen ) {
+ assert( ref_gap_prob_.size2() == ref_len_ );
+
+
+
+ // init first rows
+ std::fill( m_.begin(), m_.end(), 0.0 );
+
+
+ //std::fill( d_.begin(), d_.end(), 0.0 );
+ {
+ int i = 0; // TODO: still wearing my STL hat. Not really sure if std::generate + lambda is better than a for loop...
+ std::generate( d_.begin(), d_.end(), [&]() { return delta_ + (i++) * epsilon_; });
+
+ m_.assign( d_.begin(), d_.end() );
+
+ }
+ std::fill( i_.begin(), i_.end(), neg_inf_ );
+
+ std::fill( traceback_.begin1().begin(), traceback_.begin1().end(), tb_d_to_d | tb_m_to_d );
+ traceback_(0, 0) = tb_m_to_m;
+
+ // init first columns
+ m_[0] = 0.0;
+ d_[0] = neg_inf_;
+ i_[0] = delta_;
+
+
+
+ }
+
+
+ void precalc_log_odds() {
+ ref_state_lo_.resize( ref_state_prob_.size1(), ref_state_prob_.size2() );
+
+ for( size_t i = 0; i < 4; ++i ) {
+ const ublas::matrix_row<dmat> prow( ref_state_prob_, i );
+ ublas::matrix_row<lomat> lorow( ref_state_lo_, i );
+ std::transform( prow.begin(), prow.end(), lorow.begin(), log_odds(state_freq_[i]));
+ }
+
+
+
+// {
+// odds odds_ngap( 1 - g_gap_freq );
+// odds odds_gap( g_gap_freq );
+//
+// ref_ngap_odds_.resize(ref_gap_prob_.size());
+// ref_gap_odds_.resize(ref_gap_prob_.size());
+// for( size_t i = 0; i < ref_gap_prob_.size(); ++i ) {
+// ref_ngap_odds_[i] = odds_ngap(1 - ref_gap_prob_[i]);
+// ref_gap_odds_[i] = odds_gap( ref_gap_prob_[i] );
+// }
+// }
+ }
+
+ template<typename T, typename T2>
+ static inline std::pair<T,T2> max2( const T &a, const T &b, const T2 &a2, const T2 &b2 ) {
+ if( a > b ) {
+ return std::make_pair( a, a2 );
+ } else {
+ return std::make_pair( b, b2 );
+ }
+ }
+
+
+
+ template<typename T, typename T2>
+ static inline std::pair<T,T2> max3( const T &a, const T &b, const T &c, const T2 &a2, const T2 &b2, const T2 &c2, bool dump = false ) {
+ //return std::max( a, std::max( b, c ));
+ if( dump ) {
+ std::cout << "(" << a << " " << b << " " << c << ")";
+ }
+
+ if( a > b ) {
+ return max2( a, c, a2, c2 );
+ } else {
+ return max2( b, c, b2, c2 );
+ }
+ }
+
+
+
+
+
+ double align( const std::vector<uint8_t> &qs ) {
+ const size_t qlen = qs.size();
+
+ traceback_.resize(qlen + 1, m_.size(), false);
+
+ setup( qlen );
+
+ //dmat ref_state_trans = trans(ref_state_prob_);
+
+
+ assert( m_.size() == ref_len_ + 1 );
+
+ best_score_ = neg_inf_;
+ best_i_ = -1;
+ best_j_ = -1;
+ short last_state = 0;
+
+ for( size_t i = 1; i < qlen + 1; ++i ) {
+ const int b = qs[i-1];
+ // std::cout << "b: " << b << "\n";
+
+ //const double b_freq = state_freq_.at(b);
+ //const ublas::matrix_column<dmat> b_state( ref_state_prob_, b );
+ const ublas::matrix_row<lomat> b_state_lo( ref_state_lo_, b );
+
+ // const ublas::matrix_column<dmat> ngap_prob( ref_gap_prob_, 0 );
+ // const ublas::matrix_column<dmat> gap_prob( ref_gap_prob_, 1 );
+
+ i_[0] = delta_ + (i-1) * epsilon_;
+ m_[0] = i_[0];
+
+ lof_t diag_m = m_[0];
+ lof_t diag_d = d_[0];
+ lof_t diag_i = i_[0];
+
+ losvec::iterator m0 = m_.begin() + 1;
+ losvec::iterator d0 = d_.begin() + 1;
+ losvec::iterator i0 = i_.begin() + 1;
+
+ losvec::iterator m1 = m_.begin();
+ losvec::iterator d1 = d_.begin();
+ losvec::iterator i1 = i_.begin();
+ ublas::matrix_row<lomat>::const_iterator bsl = b_state_lo.begin();
+
+ ublas::matrix_row<ublas::matrix<short>> traceback_row( traceback_, i );
+ auto traceback_iter = traceback_row.begin();
+ *traceback_iter = tb_i_to_i | tb_m_to_i;
+ ++traceback_iter;
+
+// losvec::iterator rg = ref_gap_odds_.begin();
+// losvec::iterator rng = ref_ngap_odds_.begin();
+
+ auto gap_prob_log = ref_gap_prob_log_.begin2();
+
+ const losvec::iterator m_end = m_.end();
+
+ for( size_t j = 1; m0 != m_end; m1 = m0++, d1 = d0++, i1 = i0++, ++bsl, ++gap_prob_log, ++traceback_iter, ++j ) {
+ //ublas::matrix_row<dmat> a_state(ref_state_prob_, j-1 );
+ //ublas::matrix_row<dmat> a_gap(ref_gap_prob_, j-1 );
+
+ //double match_log_odds = log( b_state[j-1] / b_freq );
+ //lof_t match_log_odds = b_state_lo[j-1];
+ const lof_t match_log_odds = *bsl;
+
+
+
+ //lof_t gap_log_odds = ref_gap_lo_[j-1];
+ //lof_t ngap_log_odds = ref_ngap_lo_[j-1];
+// const lof_t gap_odds = *rg;
+// const lof_t ngap_odds = *rng;
+
+
+ lof_t p_ngap_log = *gap_prob_log;
+ lof_t p_gap_log = *(gap_prob_log.begin() + 1);
+
+// p_gap = std::max( p_gap, 0.01f );
+
+// lof_t m_log_sum = log(
+// exp(diag_m) * p_ngap
+// + exp(diag_d) * p_gap
+// + exp(diag_i)
+// );
+
+ // std::cout << "logp: " << log(p_gap) << " ";
+// std::cout << "\t";
+ auto m_log_max = max3<float>(
+ diag_m + p_ngap_log,
+ diag_d,
+ diag_i,
+ tb_m_to_m,
+ tb_m_to_d,
+ tb_m_to_i,
+ !true
+ );
+
+ diag_m = *m0;
+ *m0 = m_log_max.first + match_log_odds;
+ *traceback_iter = m_log_max.second; // do not or, because it's the first value tritten to the tb matrix
+// std::cout << *m0 << " ";
+
+#if 0
+ std::cout << i << " " << j << " " << m_(i,j) << " : " << m_(i-1, j-1) + ngap_log_odds
+ << " " << d_(i-1, j-1) + gap_log_odds << " " << i_(i-1, j-1) + gap_log_odds << " " << match_log_odds << " " << gap_log_odds << " " << ngap_log_odds << " max: " << m_max << "\n";
+#endif
+
+ diag_i = *i0;
+
+ // the two 'diags' have already been updated, so they're both actually containing the current 'aboves',
+ // which is exactly what we need to calculate the new i
+
+// lof_t i_log_sum = log(
+// exp(diag_m) /** delta_*/
+// + exp(diag_i) /** epsilon_*/
+// );
+ auto i_log_max = max3<float>(
+ diag_m + delta_,
+ diag_i + epsilon_,
+ diag_d + delta_,
+ tb_i_to_m,
+ tb_i_to_i,
+ tb_i_to_d
+ );
+
+
+ *i0 = i_log_max.first;
+ *traceback_iter |= i_log_max.second;
+
+#if 1
+// lof_t d_log_sum = log(
+// exp(*m1) /** delta_*/
+// + exp(*d1) /** epsilon_*/
+// );
+ auto d_log_max = max3<float>(
+ *m1 + delta_,
+ *d1 + epsilon_,
+ *i1 + delta_,
+ tb_d_to_m,
+ tb_d_to_d,
+ tb_d_to_i
+ );
+#else
+ lof_t d_log_sum = *m1 + math_approx::log(
+ delta_
+ + math_approx::exp(*d1 - *m1) * epsilon_
+ );
+#endif
+ diag_d = *d0;
+ *d0 = d_log_max.first + p_gap_log;
+ *traceback_iter |= d_log_max.second;
+
+ last_state = m_log_max.second;
+
+// if( m0 == m_end - 1 || i == qlen ) {
+// if( *m0 > best_score_ ) {
+// best_score_ = *m0;
+// best_i_ = i;
+// best_j_ = j;
+// best_state_ = m_log_max.second;
+// }
+//
+// }
+ //end_state_ = max3( *m0, *i0, *d0, tb_m_to_m, tb_m_to_i, tb_m_to_d );
+ //std::cout << end_state_.first << " ";
+// std::cout << match_log_odds << " ";
+ //lof_t old_m = m_[j];
+
+ }
+// std::cout << "\n";
+
+ }
+
+ //return m_.back();
+
+
+ best_score_ = m_.back();
+ best_i_ = qlen;
+ best_j_ = m_.size() - 1;
+ best_state_ = last_state;
+
+ return best_score_;
+ }
+
+
+ std::vector<uint8_t> traceback() {
+ std::cout << "end state: " << std::hex << int(end_state_.second) << std::dec << "\n";
+
+ int ia = traceback_.size1() - 1;
+ int ib = traceback_.size2() - 1;
+
+ int max_a = best_i_;
+ int max_b = best_j_;
+
+ assert( ia == max_a || ib == max_b );
+
+
+// for( auto it1 = traceback_.begin1(); it1 != traceback_.end1(); ++it1 ) {
+// std::cout << std::hex;
+// std::copy( it1.begin(), it1.end(), std::ostream_iterator<int>( std::cout, " " ));
+// std::cout << std::dec << "\n";
+//
+// }
+
+
+ bool in_l = false;
+ bool in_u = false;
+
+ std::vector<uint8_t> tb_out;
+ tb_out.reserve( ia + ib );
+
+ while( ia > max_a ) {
+ tb_out.push_back(2);
+ --ia;
+ }
+
+ while( ib > max_b ) {
+ tb_out.push_back(1);
+ --ib;
+ }
+
+ {
+ char tb_end = best_state_;
+
+ in_u = (tb_end & tb_m_to_i) != 0;
+ in_l = (tb_end & tb_m_to_d) != 0;
+
+ assert( !in_u || !in_l );
+ }
+
+ while( ia > 0 && ib > 0 ) {
+ auto tb = traceback_( ia, ib );
+// std::cout << std::hex;
+// std::cout << "tb: " << int(tb) << "\n";
+// std::cout << std::dec;
+ if( !in_l && !in_u ) {
+ in_l = (tb & tb_m_to_d) != 0;
+ in_u = (tb & tb_m_to_i) != 0;
+
+// if( !in_l && !in_u ) {
+//
+//
+// }
+ tb_out.push_back(0);
+ --ia;
+ --ib;
+ } else if( in_u ) {
+ tb_out.push_back(2);
+ --ia;
+
+ in_u = (tb & tb_i_to_i) != 0;
+ in_l = (tb & tb_i_to_d) != 0;
+ } else if( in_l ) {
+ tb_out.push_back(1);
+ --ib;
+
+ in_l = (tb & tb_d_to_d) != 0;
+ in_u = (tb & tb_d_to_i) != 0;
+ }
+
+
+ }
+
+ while( ia > 0 ) {
+ tb_out.push_back(2);
+ --ia;
+ }
+
+ while( ib > 0 ) {
+ tb_out.push_back(1);
+ --ib;
+ }
+
+
+ //return std::vector<char>( tb_out.rbegin(), tb_out.rend() );
+
+ return tb_out;
+ }
+private:
+ ublas::matrix<short> traceback_;
+ static const short tb_i_to_i;// = 0x1;
+ static const short tb_i_to_m;// = 0x2;
+ static const short tb_d_to_d;// = 0x4;
+ static const short tb_d_to_m;// = 0x8;
+ static const short tb_m_to_m;// = 0x10;
+ static const short tb_m_to_i;// = 0x20;
+ static const short tb_m_to_d;// = 0x40;
+ static const short tb_i_to_d;// = 0x80;
+ static const short tb_d_to_i;// = 0x100;
+
+ dmat ref_state_prob_;
+ dmat ref_gap_prob_;
+ dmat ref_gap_prob_log_;
+
+
+ lomat ref_state_lo_;
+// losvec ref_gap_odds_;
+// losvec ref_ngap_odds_;
+
+ const size_t ref_len_;
+ const boost::array<double,4> state_freq_;
+
+ const float neg_inf_;
+
+ losvec m_;
+ losvec d_;
+ losvec i_;
+
+ std::pair<lof_t, char> end_state_;
+
+ size_t max_matrix_height_;
+
+ const lof_t delta_;// = log(0.1);
+ const lof_t epsilon_;// = log(0.5);
+
+// size_t max_col_;
+// size_t max_row_;
+ double best_score_;
+ size_t best_i_;
+ size_t best_j_;
+ short best_state_;
+
+
+
+};
+
+const short log_odds_viterbi::tb_i_to_i = 0x1;
+const short log_odds_viterbi::tb_i_to_m = 0x2;
+const short log_odds_viterbi::tb_d_to_d = 0x4;
+const short log_odds_viterbi::tb_d_to_m = 0x8;
+const short log_odds_viterbi::tb_m_to_m = 0x10;
+const short log_odds_viterbi::tb_m_to_i = 0x20;
+const short log_odds_viterbi::tb_m_to_d = 0x40;
+const short log_odds_viterbi::tb_i_to_d = 0x80;
+const short log_odds_viterbi::tb_d_to_i = 0x100;
+
+namespace {
+
+template<class pvec_t,typename seq_tag>
+class my_adata_gen : public ivy_mike::tree_parser_ms::adata {
+
+
+public:
+// int m_ct;
+ my_adata_gen() {
+
+// std::cout << "my_adata\n";
+
+ }
+
+ virtual ~my_adata_gen() {
+
+// std::cout << "~my_adata\n";
+
+ }
+
+
+
+ void init_sequence( const sequence &seq ) {
+ assert( seq_.empty() );
+
+ seq_.assign( seq.begin(), seq.end() );
+ }
+
+ void update_pvec() {
+ pvec_.init2( seq_, sequence_model::model<seq_tag>() );
+ }
+
+ pvec_t &pvec() {
+ return pvec_;
+ }
+
+ const ublas::matrix<float> &calculate_anc_gap_probs() {
+
+ const auto &pgap = pvec_.get_pgap();
+
+ anc_probs_.resize(pgap.size1(), pgap.size2());
+
+
+ auto oit = anc_probs_.begin2();
+ for( auto iit = pgap.begin2(); iit != pgap.end2(); ++iit, ++oit ) {
+ double v1 = *iit * (1-pvec_pgap::pgap_model->gap_freq());
+ double v2 = *(iit.begin() + 1) * pvec_pgap::pgap_model->gap_freq();