diff --git a/CMakeLists.txt b/CMakeLists.txt index fc41303..573e0c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,10 @@ endif() # this is a hack: only add subdirectories if IVYMIKE_INCLUDE is not already set. # IVYMIKE_INCLUDE is abused as a flag indicating that papara should be build in 'library-only' mode... -#add_subdirectory(ivy_mike) + +if( NOT PAPARA_NO_IVYMIKE ) + add_subdirectory(ivy_mike) +endif() # add_subdirectory(genassign_blast) # ENDIF() @@ -66,14 +69,16 @@ add_executable(papara papara2_main.cpp ${ALL_HEADERS}) add_executable(fasta_random_sample2 fasta_random_sample2.cpp ${ALL_HEADERS}) add_executable(fasta_to_phy fasta_to_phy.cpp ${ALL_HEADERS}) add_executable(phy_to_fasta phy_to_fasta.cpp ${ALL_HEADERS}) +add_executable(phy_megamerge phy_megamerge.cpp ${ALL_HEADERS}) target_link_libraries(papara papara_core ivymike ublas_jama ${SYSDEP_LIBS} ) target_link_libraries(phy_to_fasta ivymike ${SYSDEP_LIBS} ) +target_link_libraries(phy_megamerge ivymike ${SYSDEP_LIBS} ) -#add_executable(phy_cut_partition phy_cut_partition.cpp ${ALL_HEADERS}) -#target_link_libraries(phy_cut_partition papara_core ublas_jama ivymike ${SYSDEP_LIBS} ) +add_executable(phy_cut_partition phy_cut_partition.cpp ${ALL_HEADERS}) +target_link_libraries(phy_cut_partition papara_core ublas_jama ivymike ${SYSDEP_LIBS} ) if( USE_CPP11 ) @@ -86,3 +91,4 @@ if( USE_CPP11 ) target_link_libraries(dump_anc_probs ivymike ublas_jama ${BOOST_LIBS} ${SYSDEP_LIBS} PocoFoundation) endif() +install_targets( /bin papara ) diff --git a/build_papara2.sh b/build_papara2.sh index 56a6eea..bb4c88e 100644 --- a/build_papara2.sh +++ b/build_papara2.sh @@ -5,7 +5,7 @@ -g++ -o papara -O3 -msse4.1 -I. -I ivy_mike/src/ -I ublasJama-1.0.2.3 papara.cpp pvec.cpp pars_align_seq.cpp pars_align_gapp_seq.cpp parsimony.cpp sequence_model.cpp papara2_main.cpp blast_partassign.cpp align_utils.cpp ivy_mike/src/time.cpp ivy_mike/src/tree_parser.cpp ivy_mike/src/getopt.cpp ivy_mike/src/demangle.cpp ivy_mike/src/multiple_alignment.cpp ublasJama-1.0.2.3/EigenvalueDecomposition.cpp -lpthread +g++ -o papara -O3 -msse4a -I. -I ivy_mike/src/ -I ublasJama-1.0.2.3 papara.cpp pvec.cpp pars_align_seq.cpp pars_align_gapp_seq.cpp parsimony.cpp sequence_model.cpp papara2_main.cpp blast_partassign.cpp align_utils.cpp ivy_mike/src/time.cpp ivy_mike/src/tree_parser.cpp ivy_mike/src/getopt.cpp ivy_mike/src/demangle.cpp ivy_mike/src/multiple_alignment.cpp ublasJama-1.0.2.3/EigenvalueDecomposition.cpp -lpthread #-I/usr/include/boost141/ diff --git a/ivy_mike b/ivy_mike index 3e75a03..8a000ce 160000 --- a/ivy_mike +++ b/ivy_mike @@ -1 +1 @@ -Subproject commit 3e75a03d06966170e0128825e9811d3be6da87c6 +Subproject commit 8a000ce6906839c294b67c97bdb3d6fb0a80c91d diff --git a/papara.cpp b/papara.cpp index 53f3ca2..b3e81fd 100644 --- a/papara.cpp +++ b/papara.cpp @@ -226,6 +226,11 @@ add_log_sink::~add_log_sink() { ls_buf.remove_sink(s_); } + +std::string papara::get_version_string() { + return std::string( "2.3" ); +} + template queries::queries( const std::string &opt_qs_name ) { @@ -442,7 +447,7 @@ references::references(const char* opt_tree_name, const char* op { //std::cerr << "papara_nt instantiated as: " << typeid(*this).name() << "\n"; - lout << "papara_nt instantiated as: " << ivy_mike::demangle(typeid(*this).name()) << "\n"; + lout << "references container instantiated as: " << ivy_mike::demangle(typeid(*this).name()) << "\n"; @@ -1008,6 +1013,7 @@ void driver::calc_scores(size_t n_threads, const my_references& // ivy_mike::timer t1; ivy_mike::thread_group tg; + lout << "papara_core version " << papara::get_version_string() << std::endl; lout << "start scoring, using " << n_threads << " threads" << std::endl; typedef worker worker_t; diff --git a/papara.h b/papara.h index 6662232..a7a0bd5 100644 --- a/papara.h +++ b/papara.h @@ -1073,7 +1073,7 @@ bool file_exists(const char *filename) } // end anonymous namespace for inline util functions (move to impl file!) - +std::string get_version_string(); } // end of namespace papara #endif diff --git a/papara2_main.cpp b/papara2_main.cpp index fab605d..02dfc27 100644 --- a/papara2_main.cpp +++ b/papara2_main.cpp @@ -37,7 +37,7 @@ void print_banner( std::ostream &os ) { os << "| __/ _` | __/ _` | // _` |\n"; os << "| | | (_| | | | (_| | |\\ \\ (_| |\n"; os << "\\_| \\__,_\\_| \\__,_\\_| \\_\\__,_|\n"; - os << " Version 2.3\n"; + os << " papara_core version " << papara::get_version_string() << "\n"; } @@ -215,11 +215,15 @@ void run_papara( const std::string &qs_name, const std::string &alignment_name, } +void print_commandline( std::ostream &os, char **argv, int argc ) { -int main( int argc, char *argv[] ) { + std::stringstream ss; + std::copy( argv, argv + argc, std::ostream_iterator( os, " " ) ); + os << std::endl; +} -// aligned_buffer xxx(1024); - + +int main( int argc, char *argv[] ) { namespace igo = ivy_mike::getopt; @@ -355,7 +359,8 @@ int main( int argc, char *argv[] ) { papara::add_log_tee papara_log_file( logs ); - + papara::lout << "papara called as:\n"; + print_commandline( papara::lout, argv, argc ); const bool ref_gaps = !opt_no_ref_gaps; diff --git a/phy_cut_partition.cpp b/phy_cut_partition.cpp index 99e1327..3f78d17 100644 --- a/phy_cut_partition.cpp +++ b/phy_cut_partition.cpp @@ -56,7 +56,8 @@ int main( int argc, char *argv[] ) { std::ostream &os = std::cout; size_t name_width = lp.max_name_len() + 1; - + + os << lp.size() << " " << col_max - col_min << "\n"; for( int i = 0, size = lp.size(); i < size; ++i ) { std::string name = lp.name_at(i); @@ -74,4 +75,4 @@ int main( int argc, char *argv[] ) { } -} \ No newline at end of file +} diff --git a/phy_megamerge.cpp b/phy_megamerge.cpp new file mode 100644 index 0000000..f4ba393 --- /dev/null +++ b/phy_megamerge.cpp @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2013 Simon A. Berger + * + * This file is part of papara. + * + * papara is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * papara is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with papara. If not, see . + */ + + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ivymike/large_phylip.h" + + +void collect_name_prefixes( std::map > > &name_prefixes, std::set &non_prefixed_names, const ivy_mike::large_phylip &lp, size_t lp_num ) { + + const size_t lp_size = lp.size(); + + for( size_t i = 0; i < lp_size; ++i ) { + std::string name = lp.name_at(i); + + if( name.find("motu_cluster_") == 0 ) { + size_t epos = name.find( "." ); + + assert( epos != name.npos ); + std::string prefix = name.substr(0, epos); + + //name_prefixes.insert( prefix ); + std::vector > &location_list = name_prefixes[prefix]; + + location_list.push_back(std::make_pair( lp_num, i ) ); + } else { + non_prefixed_names.insert( name ); + } + } +} + + +std::string merge_nongap_columns( const std::vector &phys, const std::vector > &matches ) { + const size_t seq_len = phys.front()->sequence_len(); + + std::string out_seq; + + + out_seq.reserve( seq_len ); + + + + for( size_t i = 0; i < seq_len; ++i ) { + char c = '-'; + + + for( std::vector< std::pair< size_t, size_t > >::const_iterator it = matches.begin(); it != matches.end(); ++it ) { + const ivy_mike::large_phylip &lp = *phys.at( it->first ); + char *base_ptr = (char *)lp.sequence_begin_at( it->second ); + + if( base_ptr[i] != '-' ) { + if( c != '-' ) { + std::cerr << "meeeeeeeeeep: conflict!" << std::endl; + throw std::runtime_error( "bailing out\n" ); + } + + c = base_ptr[i]; + } + + } + + out_seq.push_back( c ); + + } + + return out_seq; +} + +int main( int argc, char *argv[] ) { + std::vector phys; + std::map > > name_prefixes; + std::set non_prefixed_names; + + for( size_t i = 1; i < size_t(argc); ++i ) { + phys.push_back( new ivy_mike::large_phylip(argv[i])); + + collect_name_prefixes( name_prefixes, non_prefixed_names, *phys.back(), phys.size() - 1 ); + } + + size_t max_name_len = 0; + + std::cout << "non prefix: " << non_prefixed_names.size() << "\n"; + for( std::set< std::string >::iterator it = non_prefixed_names.begin(); it != non_prefixed_names.end(); ++it ) { + max_name_len = std::max( max_name_len, it->size() ); + } + + + + for( std::map< std::string, std::vector< std::pair< size_t, size_t > > >::iterator it = name_prefixes.begin(); it != name_prefixes.end(); ++it ) { + max_name_len = std::max( max_name_len, it->first.size() ); + + std::cout << it->first << ":"; + for( std::vector< std::pair< size_t, size_t > >::iterator it2 = it->second.begin(); it2 != it->second.end(); ++it2 ) { + std::cout << " " << it2->first << "/" << it2->second; + } + + std::cout << "\n"; + } + + std::ofstream os( "merged.phy" ); + + + + { + ivy_mike::large_phylip &temp = *phys.front(); + + os << non_prefixed_names.size() + name_prefixes.size() << " " << temp.sequence_len() << "\n"; + + for( std::set< std::string >::iterator it = non_prefixed_names.begin(); it != non_prefixed_names.end(); ++it ) { + size_t idx = temp.getIdx( it->c_str() ); + + os << std::setw( max_name_len + 1 ) << std::left << *it; + std::copy( temp.sequence_begin_at(idx), temp.sequence_end_at(idx), std::ostream_iterator(os) ); + os << "\n"; + } + } + + for( std::map< std::string, std::vector< std::pair< size_t, size_t > > >::iterator it = name_prefixes.begin(); it != name_prefixes.end(); ++it ) { + os << std::setw( max_name_len + 1 ) << std::left << it->first; + + std::string merged_seq = merge_nongap_columns( phys, it->second ); + std::copy( merged_seq.begin(), merged_seq.end(), std::ostream_iterator(os) ); + os << "\n"; + } + +} \ No newline at end of file diff --git a/stepwise_align.h b/stepwise_align.h index e85491b..d828185 100644 --- a/stepwise_align.h +++ b/stepwise_align.h @@ -790,7 +790,7 @@ class pvec_aligner_vec { size_t block_start_outer = a_start_idx; //ptr_block_outer.s_iter = s_.base(); //ptr_block_outer.si_iter = si_.base(); - + //std::cerr << "sm_inc: " << &(*sm_inc_prof_.begin()) << std::endl; ticks ticks1 = getticks(); @@ -841,14 +841,16 @@ class pvec_aligner_vec { //aiter a_aux_end_this = ptr_block.a_aux_prof_iter + W * block_width; - score_t * __restrict s_iter = s_.base(); - score_t * __restrict si_iter = si_.base(); + score_t * s_iter = s_.base(); + score_t * si_iter = si_.base(); // score_t * __restrict a_aux_prof_iter = &(*(a_aux_start + block_start * W)); // score_t * __restrict a_aux_prof_end = &(*(a_aux_start + block_end * W)); - score_t * __restrict sm_inc_iter = &(*(sm_inc_prof_.begin() + (*it_b) * av_size_all + block_start * W)); - score_t * __restrict sm_inc_end = &(*(sm_inc_prof_.begin() + (*it_b) * av_size_all + block_end * W)); + score_t * sm_inc_iter = &(*(sm_inc_prof_.begin() + (*it_b) * av_size_all + block_start * W)); + score_t * sm_inc_end = &(*(sm_inc_prof_.begin() + (*it_b) * av_size_all + block_end * W)); + + - _mm_prefetch( (const char *)sm_inc_iter, _MM_HINT_T0 ); + // _mm_prefetch( (const char *)sm_inc_iter, _MM_HINT_T0 ); vec_t last_sdiag = vu::load( &(*block_sdiag_it)); diff --git a/vec_unit.h b/vec_unit.h index 1f92d70..8c83a08 100644 --- a/vec_unit.h +++ b/vec_unit.h @@ -422,12 +422,26 @@ struct vector_unit { return _mm_max_epi32( a, b ); #else //#warning "probably untested code!" - assert(0); const vec_t ma = _mm_cmpgt_epi32( a, b ); - return _mm_or_si128( _mm_and_si128( ma, a ), _mm_andnot_si128( ma, b ) ); + const vec_t ret = _mm_or_si128( _mm_and_si128( ma, a ), _mm_andnot_si128( ma, b ) ); +#if 0 + println( a ); + println( b ); + println( ret ); + + assert(0); +#endif + return ret; #endif } - + + static inline void println( const vec_t & v ) { + + T tmp[W]; + _mm_storeu_si128( (vec_t*)tmp, v ); + printf( "%d %d %d %d\n", tmp[0], tmp[1], tmp[2], tmp[3] ); + } + static inline const vec_t abs_diff( const vec_t &a, const vec_t &b ) { // i don't really like this function, as ideally this would just be abs(sub(a,b)), // but there doesn't seem to be a fast way to implement abs on pre SSSSSSE3.