Permalink
Browse files

added phy_cut_partition for cutting sindividual partitions out of lar…

…ge phylip files.
  • Loading branch information...
1 parent a722f68 commit 41f271e44aac1300d3fa28911797b42cc09fc364 @sim82 committed Nov 20, 2012
Showing with 531 additions and 44 deletions.
  1. +3 −0 CMakeLists.txt
  2. +23 −0 blast_partassign.cpp
  3. +4 −0 blast_partassign.h
  4. +223 −22 fasta_random_sample2.cpp
  5. +1 −1 ivy_mike
  6. +7 −5 papara.cpp
  7. +9 −4 papara.h
  8. +39 −9 papara2_main.cpp
  9. +83 −0 phy_cut_partition.cpp
  10. +139 −3 stepwise_align.h
View
@@ -43,19 +43,22 @@ set( ALL_HEADERS )
ENDIF()
+
ADD_LIBRARY( papara_core STATIC papara.cpp pvec.cpp pars_align_seq.cpp pars_align_gapp_seq.cpp parsimony.cpp sequence_model.cpp align_utils.cpp blast_partassign.cpp )
# add_executable(papara_nt main.cpp pvec.cpp pars_align_seq.cpp pars_align_gapp_seq.cpp parsimony.cpp ${ALL_HEADERS})
add_executable(papara papara2_main.cpp ${ALL_HEADERS})
add_executable(fasta_random_sample2 fasta_random_sample2.cpp ${ALL_HEADERS})
add_executable(fasta_to_phy fasta_to_phy.cpp ${ALL_HEADERS})
add_executable(phy_to_fasta phy_to_fasta.cpp ${ALL_HEADERS})
+add_executable(phy_cut_partition phy_cut_partition.cpp ${ALL_HEADERS})
# add_executable(pw_dist pw_dist.cpp pairwise_seq_distance.cpp )
# target_link_libraries(papara_nt ivymike ublas_jama ${SYSDEP_LIBS} )
target_link_libraries(papara papara_core ivymike ublas_jama ${SYSDEP_LIBS} )
target_link_libraries(phy_to_fasta ivymike ${SYSDEP_LIBS} )
+target_link_libraries(phy_cut_partition papara_core ublas_jama ivymike ${SYSDEP_LIBS} )
# target_link_libraries(pw_dist ivymike ${BOOST_LIBS} ${SYSDEP_LIBS})
View
@@ -152,6 +152,7 @@ partition partassign::next_partition( std::istream &is ) {
partition part;
part.start = from_string<int>(start_str) - 1;
part.end = from_string<int>(end_str) - 1;
+ part.gene_name = gene_name;
return part;
}
@@ -350,6 +351,28 @@ std::vector<std::pair<size_t,size_t> > resolve_qs_bounds( references<pvec_t,seq_
return bounds;
}
+
+std::pair<size_t,size_t> partition_bounds( std::istream &is, const std::string &name ) {
+ std::pair<size_t,size_t> bounds(-1,-1);
+
+ while ( is.good() ) {
+
+ partition p = partassign::next_partition ( is );
+ if ( p.start == -1 ) { // returning a partition with negaitve indices is next_partition's way of signalling EOF
+ break;
+ }
+
+// std::cout << "gene: " << p.gene_name << "\n";
+
+ if( p.gene_name == name ) {
+ return std::pair<size_t,size_t>(p.start, p.end);
+ }
+
+ }
+ return std::pair<size_t,size_t>(-1,-1);
+
+}
+
// combinatorial explosion hazard ahead... if another function comes along put it into a driver class just like papara::driver.
template std::vector<std::pair<size_t,size_t> > resolve_qs_bounds<pvec_cgap,sequence_model::tag_aa>( references<pvec_cgap,sequence_model::tag_aa> &refs, queries<sequence_model::tag_aa> &qs, const partassign::part_assignment &part_assign );
View
@@ -39,6 +39,7 @@ class partition {
partition() : start(-1), end(-1) {}
int start;
int end;
+ std::string gene_name;
};
@@ -77,5 +78,8 @@ template<typename pvec_t, typename seq_tag>
std::vector<std::pair<size_t,size_t> > resolve_qs_bounds( papara::references<pvec_t,seq_tag> &refs, papara::queries<seq_tag> &qs, const partassign::part_assignment &part_assign );
+std::pair<size_t,size_t> partition_bounds( std::istream &is, const std::string &name );
+
+
}
#endif
View
@@ -24,20 +24,60 @@
#include "ivymike/disable_shit.h"
#include <iostream>
#include <algorithm>
+#include <numeric>
#include <iterator>
-
+#include <cmath>
#include "ivymike/fasta.h"
#include "stepwise_align.h"
-void process_fasta( std::istream &is )
+
+typedef std::vector<uint8_t> primer_sequence;
+
+std::vector<primer_sequence> load_primers( const std::string &primer_name ) {
+ std::vector<primer_sequence> pl;
+
+ if( primer_name.empty() ) {
+ return pl;
+ }
+
+ std::ifstream is( primer_name.c_str() );
+
+ if( !is.good() ) {
+ throw std::runtime_error( "cannot open primer file" );
+ }
+
+ std::vector<std::string> names;
+
+ ivy_mike::read_fasta( is, names, pl );
+
+ return pl;
+
+
+}
+
+
+std::vector<uint8_t> to_vec( const std::string &s ) {
+ return std::vector<uint8_t>( s.begin(), s.end() );
+}
+
+std::vector<primer_sequence> builtin_primers() {
+ std::vector<primer_sequence> pl;
+
+ pl.push_back( to_vec( "CTTGGTCATTTAGAGGAAGT" ) );
+ pl.push_back( to_vec( "CGATGAAGAACGCAG" ) );
+
+ return pl;
+}
+
+void process_fasta( std::istream &is, const std::string &primer_name )
{
-
- std::vector<std::string> primers;
- primers.push_back("CTTGGTCATTTAGAGGAAGT");
- primers.push_back("CGATGAAGAACGCAG");
+ std::vector<primer_sequence> primers = builtin_primers();
+// std::vector<std::string> primers = load_primers( primer_name );
+// primers.push_back("CTTGGTCATTTAGAGGAAGT");
+// primers.push_back("CGATGAAGAACGCAG");
std::vector<std::string> names;
std::vector<std::vector<uint8_t> > data;
@@ -69,14 +109,15 @@ void process_fasta( std::istream &is )
- std::vector<uint8_t> seq = data.at(r);
+ const std::vector<uint8_t> &seq = data.at(r);
size_t num_rejects = 0;
for( size_t j = 0; j < primers.size(); ++j ) {
- std::vector<uint8_t> p( primers[j].begin(), primers[j].end() );
- float score = align_freeshift( sm, seq, p, -5, -2, false );
+// std::vector<uint8_t> p( primers[j].begin(), primers[j].end() ); // align_freeshift overwrites the input sequences (i.e., in/out parameters)
+ const std::vector<uint8_t> &p = primers[j];
+ float score = align_freeshift_score( sm, seq, p, -5, -2, false );
float escore = match_score * p.size();
if( score < 0.9 * escore ) {
++num_rejects;
@@ -97,10 +138,154 @@ void process_fasta( std::istream &is )
++num_written;
}
+ if( num_written == 0 ) {
+ throw std::runtime_error( "none of the input sequences were selected. bailing out." );
+ }
+
+}
+
+
+void process_fasta_weighted( std::istream &is, const std::string &primer_name )
+{
+
+ std::vector<primer_sequence> primers = builtin_primers();
+ //std::vector<primer_sequence> primers = load_primers( primer_name );
+// primers.push_back("CTTGGTCATTTAGAGGAAGT");
+// primers.push_back("CGATGAAGAACGCAG");
+
+ std::vector<std::string> names;
+ std::vector<std::vector<uint8_t> > data;
+
+ ivy_mike::read_fasta( is, names, data, false );
+
+ int baseweight = 1000;
+ std::vector<int> weights( data.size(), baseweight );
+
+ const size_t n = names.size();
+ assert( n == data.size());
+
+ std::vector<size_t> rind;
+ for( size_t i = 0; i < names.size(); ++i ) {
+ rind.push_back(i);
+ }
+
+ std::random_shuffle( rind.begin(), rind.end() );
+
+ const size_t target_size = 10;
+
+ size_t num_written = 0;
+ int match_score = 5;
+ ivy_mike::scoring_matrix sm(5, -3);
+
+
+ for( size_t i = 0; i < n; ++i ) {
+ // calculate sequence weighting based on primer matches
+
+ const std::vector<uint8_t> &seq = data.at(i);
+
+ for( size_t j = 0; j < primers.size(); ++j ) {
+ const std::vector<uint8_t> &p = primers[j];
+ float score = align_freeshift_score( sm, seq, p, -5, -2, false );
+ float escore = match_score * p.size();
+ // use 'fixed point' arithmetics for the weigths
+ weights[i] += floor((score / escore) * baseweight * 10);
+ }
+
+// std::cout << "weight: " << i << " " << weights[i] << "\n";
+ }
+
+
+ while( num_written < target_size && !data.empty() ) {
+ // random selection: each remaining sequence forms a bin/interval sized according to the weights.
+ // The weigths are integers (approximating fixed-point weights), to prevent float/roundoff weirdness.
+
+ int weight_sum = std::accumulate( weights.begin(), weights.end(), 0 );
+
+ int r = rand() % weight_sum;
+
+ int cur_end = 0;
+ size_t sel = size_t(-1);
+ size_t n_remain = names.size();
+
+
+ // select the bin into which the random value r falls
+ for( sel = 0; sel < n_remain; ++sel ) {
+ cur_end += weights[sel];
+
+ if( cur_end > r ) {
+ break;
+ }
+
+ }
+
+ if( sel >= n_remain ) {
+ throw std::runtime_error( "bad math error (i.e., I'm too stupid to get simple integer math right)" );
+ }
+
+ std::cerr << "============== select " << names.at(sel) << " weight " << weights.at(sel) << std::endl;
+ std::cout << ">" << names.at(sel) << "\n";
+
+ std::copy( data.at(sel).begin(), data.at(sel).end(), std::ostream_iterator<char>(std::cout) );
+ std::cout << std::endl;
+
+
+ // erase selected sequence (i.e, random sampling without replacement)
+ names.erase( names.begin() + sel );
+ data.erase( data.begin() + sel );
+ weights.erase( weights.begin() + sel );
+
+ ++num_written;
+
+
+ }
+
+
+
+
+// while( num_written < target_size && !rind.empty()) {
+// size_t r = rind.back();
+// rind.pop_back();
+//
+//
+//
+// std::vector<uint8_t> seq = data.at(r);
+//
+//
+//
+// size_t num_rejects = 0;
+// for( size_t j = 0; j < primers.size(); ++j ) {
+// std::vector<uint8_t> p( primers[j].begin(), primers[j].end() ); // align_freeshift overwrites the input sequences (i.e., in/out parameters)
+// //const std::vector<uint8_t> &p = primers[j];
+// float score = align_freeshift( sm, seq, p, -5, -2, false );
+// float escore = match_score * p.size();
+// if( score < 0.9 * escore ) {
+// ++num_rejects;
+// }
+//
+// // std::cout << "score " << j << ": " << score << " " << p.size() * match_score << "\n";
+// }
+//
+// if( num_rejects != 0 ) {
+// // std::cout << "reject!\n";
+// continue;
+// }
+//
+// std::cout << ">" << names.at(r) << "\n";
+//
+// std::copy( data.at(r).begin(), data.at(r).end(), std::ostream_iterator<char>(std::cout) );
+// std::cout << "\n";
+//
+// ++num_written;
+// }
+ if( num_written == 0 ) {
+ throw std::runtime_error( "none of the input sequences were selected. bailing out." );
+ }
+
}
+
int main( int argc, char *argv[] ) {
@@ -111,21 +296,37 @@ int main( int argc, char *argv[] ) {
unsigned int seed = atoi( argv[1] );
std::cerr << "rand seed: " << seed << std::endl;
- srand(seed);
- if( argc == 2 ) {
- process_fasta( std::cin );
- } else {
- for( int i = 2; i < argc; ++i ) {
- std::ifstream is(argv[i] );
-
- if( !is.good() ) {
- std::cerr << "WARNING: cannot read: " << argv[i] << "\n";
- continue;
- }
+ std::string primer_name;
+ if( argc > 3 ) {
+ primer_name = argv[3];
+ }
+
+ srand(seed);
+ {
+ std::ifstream is( argv[2] );
+
+ if( !is.good() ) {
+ std::cerr << "WARNING: cannot read: " << argv[2] << "\n";
- process_fasta( is );
}
- }
+
+ //process_fasta_weighted( is, primer_name );
+ process_fasta( is, primer_name );
+ }
+// if( argc == 2 ) {
+// process_fasta_weighted( std::cin, primer_name );
+// } else {
+// for( int i = 2; i < argc; ++i ) {
+// std::ifstream is( argv[i] );
+//
+// if( !is.good() ) {
+// std::cerr << "WARNING: cannot read: " << argv[i] << "\n";
+// continue;
+// }
+//
+// process_fasta_weighted( is, primer_name );
+// }
+// }
}
Oops, something went wrong.

0 comments on commit 41f271e

Please sign in to comment.