Skip to content

Commit

Permalink
added phy_cut_partition for cutting sindividual partitions out of lar…
Browse files Browse the repository at this point in the history
…ge phylip files.
  • Loading branch information
sim82 committed Nov 20, 2012
1 parent a722f68 commit 41f271e
Show file tree
Hide file tree
Showing 10 changed files with 531 additions and 44 deletions.
3 changes: 3 additions & 0 deletions CMakeLists.txt
Expand Up @@ -43,19 +43,22 @@ set( ALL_HEADERS )
ENDIF() ENDIF()





ADD_LIBRARY( papara_core STATIC papara.cpp pvec.cpp pars_align_seq.cpp pars_align_gapp_seq.cpp parsimony.cpp sequence_model.cpp align_utils.cpp blast_partassign.cpp ) ADD_LIBRARY( papara_core STATIC papara.cpp pvec.cpp pars_align_seq.cpp pars_align_gapp_seq.cpp parsimony.cpp sequence_model.cpp align_utils.cpp blast_partassign.cpp )


# add_executable(papara_nt main.cpp pvec.cpp pars_align_seq.cpp pars_align_gapp_seq.cpp parsimony.cpp ${ALL_HEADERS}) # add_executable(papara_nt main.cpp pvec.cpp pars_align_seq.cpp pars_align_gapp_seq.cpp parsimony.cpp ${ALL_HEADERS})
add_executable(papara papara2_main.cpp ${ALL_HEADERS}) add_executable(papara papara2_main.cpp ${ALL_HEADERS})
add_executable(fasta_random_sample2 fasta_random_sample2.cpp ${ALL_HEADERS}) add_executable(fasta_random_sample2 fasta_random_sample2.cpp ${ALL_HEADERS})
add_executable(fasta_to_phy fasta_to_phy.cpp ${ALL_HEADERS}) add_executable(fasta_to_phy fasta_to_phy.cpp ${ALL_HEADERS})
add_executable(phy_to_fasta phy_to_fasta.cpp ${ALL_HEADERS}) add_executable(phy_to_fasta phy_to_fasta.cpp ${ALL_HEADERS})
add_executable(phy_cut_partition phy_cut_partition.cpp ${ALL_HEADERS})


# add_executable(pw_dist pw_dist.cpp pairwise_seq_distance.cpp ) # add_executable(pw_dist pw_dist.cpp pairwise_seq_distance.cpp )


# target_link_libraries(papara_nt ivymike ublas_jama ${SYSDEP_LIBS} ) # target_link_libraries(papara_nt ivymike ublas_jama ${SYSDEP_LIBS} )
target_link_libraries(papara papara_core ivymike ublas_jama ${SYSDEP_LIBS} ) target_link_libraries(papara papara_core ivymike ublas_jama ${SYSDEP_LIBS} )
target_link_libraries(phy_to_fasta ivymike ${SYSDEP_LIBS} ) target_link_libraries(phy_to_fasta ivymike ${SYSDEP_LIBS} )
target_link_libraries(phy_cut_partition papara_core ublas_jama ivymike ${SYSDEP_LIBS} )
# target_link_libraries(pw_dist ivymike ${BOOST_LIBS} ${SYSDEP_LIBS}) # target_link_libraries(pw_dist ivymike ${BOOST_LIBS} ${SYSDEP_LIBS})




Expand Down
23 changes: 23 additions & 0 deletions blast_partassign.cpp
Expand Up @@ -152,6 +152,7 @@ partition partassign::next_partition( std::istream &is ) {
partition part; partition part;
part.start = from_string<int>(start_str) - 1; part.start = from_string<int>(start_str) - 1;
part.end = from_string<int>(end_str) - 1; part.end = from_string<int>(end_str) - 1;
part.gene_name = gene_name;
return part; return part;
} }


Expand Down Expand Up @@ -350,6 +351,28 @@ std::vector<std::pair<size_t,size_t> > resolve_qs_bounds( references<pvec_t,seq_
return bounds; return bounds;
} }



std::pair<size_t,size_t> partition_bounds( std::istream &is, const std::string &name ) {
std::pair<size_t,size_t> bounds(-1,-1);

while ( is.good() ) {

partition p = partassign::next_partition ( is );
if ( p.start == -1 ) { // returning a partition with negaitve indices is next_partition's way of signalling EOF
break;
}

// std::cout << "gene: " << p.gene_name << "\n";

if( p.gene_name == name ) {
return std::pair<size_t,size_t>(p.start, p.end);
}

}
return std::pair<size_t,size_t>(-1,-1);

}

// combinatorial explosion hazard ahead... if another function comes along put it into a driver class just like papara::driver. // combinatorial explosion hazard ahead... if another function comes along put it into a driver class just like papara::driver.


template std::vector<std::pair<size_t,size_t> > resolve_qs_bounds<pvec_cgap,sequence_model::tag_aa>( references<pvec_cgap,sequence_model::tag_aa> &refs, queries<sequence_model::tag_aa> &qs, const partassign::part_assignment &part_assign ); template std::vector<std::pair<size_t,size_t> > resolve_qs_bounds<pvec_cgap,sequence_model::tag_aa>( references<pvec_cgap,sequence_model::tag_aa> &refs, queries<sequence_model::tag_aa> &qs, const partassign::part_assignment &part_assign );
Expand Down
4 changes: 4 additions & 0 deletions blast_partassign.h
Expand Up @@ -39,6 +39,7 @@ class partition {
partition() : start(-1), end(-1) {} partition() : start(-1), end(-1) {}
int start; int start;
int end; int end;
std::string gene_name;
}; };




Expand Down Expand Up @@ -77,5 +78,8 @@ template<typename pvec_t, typename seq_tag>
std::vector<std::pair<size_t,size_t> > resolve_qs_bounds( papara::references<pvec_t,seq_tag> &refs, papara::queries<seq_tag> &qs, const partassign::part_assignment &part_assign ); std::vector<std::pair<size_t,size_t> > resolve_qs_bounds( papara::references<pvec_t,seq_tag> &refs, papara::queries<seq_tag> &qs, const partassign::part_assignment &part_assign );




std::pair<size_t,size_t> partition_bounds( std::istream &is, const std::string &name );


} }
#endif #endif
245 changes: 223 additions & 22 deletions fasta_random_sample2.cpp
Expand Up @@ -24,20 +24,60 @@
#include "ivymike/disable_shit.h" #include "ivymike/disable_shit.h"
#include <iostream> #include <iostream>
#include <algorithm> #include <algorithm>
#include <numeric>
#include <iterator> #include <iterator>

#include <cmath>




#include "ivymike/fasta.h" #include "ivymike/fasta.h"
#include "stepwise_align.h" #include "stepwise_align.h"


void process_fasta( std::istream &is )
typedef std::vector<uint8_t> primer_sequence;

std::vector<primer_sequence> load_primers( const std::string &primer_name ) {
std::vector<primer_sequence> pl;

if( primer_name.empty() ) {
return pl;
}

std::ifstream is( primer_name.c_str() );

if( !is.good() ) {
throw std::runtime_error( "cannot open primer file" );
}

std::vector<std::string> names;

ivy_mike::read_fasta( is, names, pl );

return pl;


}


std::vector<uint8_t> to_vec( const std::string &s ) {
return std::vector<uint8_t>( s.begin(), s.end() );
}

std::vector<primer_sequence> builtin_primers() {
std::vector<primer_sequence> pl;

pl.push_back( to_vec( "CTTGGTCATTTAGAGGAAGT" ) );
pl.push_back( to_vec( "CGATGAAGAACGCAG" ) );

return pl;
}

void process_fasta( std::istream &is, const std::string &primer_name )
{ {


std::vector<primer_sequence> primers = builtin_primers();
std::vector<std::string> primers; // std::vector<std::string> primers = load_primers( primer_name );
primers.push_back("CTTGGTCATTTAGAGGAAGT"); // primers.push_back("CTTGGTCATTTAGAGGAAGT");
primers.push_back("CGATGAAGAACGCAG"); // primers.push_back("CGATGAAGAACGCAG");


std::vector<std::string> names; std::vector<std::string> names;
std::vector<std::vector<uint8_t> > data; std::vector<std::vector<uint8_t> > data;
Expand Down Expand Up @@ -69,14 +109,15 @@ void process_fasta( std::istream &is )






std::vector<uint8_t> seq = data.at(r); const std::vector<uint8_t> &seq = data.at(r);






size_t num_rejects = 0; size_t num_rejects = 0;
for( size_t j = 0; j < primers.size(); ++j ) { for( size_t j = 0; j < primers.size(); ++j ) {
std::vector<uint8_t> p( primers[j].begin(), primers[j].end() ); // std::vector<uint8_t> p( primers[j].begin(), primers[j].end() ); // align_freeshift overwrites the input sequences (i.e., in/out parameters)
float score = align_freeshift( sm, seq, p, -5, -2, false ); const std::vector<uint8_t> &p = primers[j];
float score = align_freeshift_score( sm, seq, p, -5, -2, false );
float escore = match_score * p.size(); float escore = match_score * p.size();
if( score < 0.9 * escore ) { if( score < 0.9 * escore ) {
++num_rejects; ++num_rejects;
Expand All @@ -97,10 +138,154 @@ void process_fasta( std::istream &is )


++num_written; ++num_written;
} }
if( num_written == 0 ) {
throw std::runtime_error( "none of the input sequences were selected. bailing out." );
}

}


void process_fasta_weighted( std::istream &is, const std::string &primer_name )
{

std::vector<primer_sequence> primers = builtin_primers();
//std::vector<primer_sequence> primers = load_primers( primer_name );
// primers.push_back("CTTGGTCATTTAGAGGAAGT");
// primers.push_back("CGATGAAGAACGCAG");

std::vector<std::string> names;
std::vector<std::vector<uint8_t> > data;

ivy_mike::read_fasta( is, names, data, false );

int baseweight = 1000;
std::vector<int> weights( data.size(), baseweight );

const size_t n = names.size();
assert( n == data.size());

std::vector<size_t> rind;
for( size_t i = 0; i < names.size(); ++i ) {
rind.push_back(i);
}

std::random_shuffle( rind.begin(), rind.end() );

const size_t target_size = 10;

size_t num_written = 0;




int match_score = 5;
ivy_mike::scoring_matrix sm(5, -3);


for( size_t i = 0; i < n; ++i ) {
// calculate sequence weighting based on primer matches

const std::vector<uint8_t> &seq = data.at(i);

for( size_t j = 0; j < primers.size(); ++j ) {
const std::vector<uint8_t> &p = primers[j];
float score = align_freeshift_score( sm, seq, p, -5, -2, false );
float escore = match_score * p.size();
// use 'fixed point' arithmetics for the weigths
weights[i] += floor((score / escore) * baseweight * 10);
}

// std::cout << "weight: " << i << " " << weights[i] << "\n";
}


while( num_written < target_size && !data.empty() ) {
// random selection: each remaining sequence forms a bin/interval sized according to the weights.
// The weigths are integers (approximating fixed-point weights), to prevent float/roundoff weirdness.

int weight_sum = std::accumulate( weights.begin(), weights.end(), 0 );

int r = rand() % weight_sum;

int cur_end = 0;
size_t sel = size_t(-1);
size_t n_remain = names.size();


// select the bin into which the random value r falls
for( sel = 0; sel < n_remain; ++sel ) {
cur_end += weights[sel];

if( cur_end > r ) {
break;
}

}

if( sel >= n_remain ) {
throw std::runtime_error( "bad math error (i.e., I'm too stupid to get simple integer math right)" );
}

std::cerr << "============== select " << names.at(sel) << " weight " << weights.at(sel) << std::endl;
std::cout << ">" << names.at(sel) << "\n";

std::copy( data.at(sel).begin(), data.at(sel).end(), std::ostream_iterator<char>(std::cout) );
std::cout << std::endl;


// erase selected sequence (i.e, random sampling without replacement)
names.erase( names.begin() + sel );
data.erase( data.begin() + sel );
weights.erase( weights.begin() + sel );

++num_written;


}




// while( num_written < target_size && !rind.empty()) {
// size_t r = rind.back();
// rind.pop_back();
//
//
//
// std::vector<uint8_t> seq = data.at(r);
//
//
//
// size_t num_rejects = 0;
// for( size_t j = 0; j < primers.size(); ++j ) {
// std::vector<uint8_t> p( primers[j].begin(), primers[j].end() ); // align_freeshift overwrites the input sequences (i.e., in/out parameters)
// //const std::vector<uint8_t> &p = primers[j];
// float score = align_freeshift( sm, seq, p, -5, -2, false );
// float escore = match_score * p.size();
// if( score < 0.9 * escore ) {
// ++num_rejects;
// }
//
// // std::cout << "score " << j << ": " << score << " " << p.size() * match_score << "\n";
// }
//
// if( num_rejects != 0 ) {
// // std::cout << "reject!\n";
// continue;
// }
//
// std::cout << ">" << names.at(r) << "\n";
//
// std::copy( data.at(r).begin(), data.at(r).end(), std::ostream_iterator<char>(std::cout) );
// std::cout << "\n";
//
// ++num_written;
// }
if( num_written == 0 ) {
throw std::runtime_error( "none of the input sequences were selected. bailing out." );
}

} }



int main( int argc, char *argv[] ) { int main( int argc, char *argv[] ) {




Expand All @@ -111,21 +296,37 @@ int main( int argc, char *argv[] ) {


unsigned int seed = atoi( argv[1] ); unsigned int seed = atoi( argv[1] );
std::cerr << "rand seed: " << seed << std::endl; std::cerr << "rand seed: " << seed << std::endl;
srand(seed);


if( argc == 2 ) { std::string primer_name;
process_fasta( std::cin ); if( argc > 3 ) {
} else { primer_name = argv[3];
for( int i = 2; i < argc; ++i ) { }
std::ifstream is(argv[i] );

srand(seed);
if( !is.good() ) { {
std::cerr << "WARNING: cannot read: " << argv[i] << "\n"; std::ifstream is( argv[2] );
continue;
} if( !is.good() ) {
std::cerr << "WARNING: cannot read: " << argv[2] << "\n";


process_fasta( is );
} }
}
//process_fasta_weighted( is, primer_name );
process_fasta( is, primer_name );
}
// if( argc == 2 ) {
// process_fasta_weighted( std::cin, primer_name );
// } else {
// for( int i = 2; i < argc; ++i ) {
// std::ifstream is( argv[i] );
//
// if( !is.good() ) {
// std::cerr << "WARNING: cannot read: " << argv[i] << "\n";
// continue;
// }
//
// process_fasta_weighted( is, primer_name );
// }
// }
} }


0 comments on commit 41f271e

Please sign in to comment.