Skip to content

Commit

Permalink
1. updated pHMM based realignment function (from longshot)
Browse files Browse the repository at this point in the history
2. sum of all paths is now default for pacbio/ont reads
3. extracthairs checks if the input VCF has extension '.gz' and fails if that is the case
  • Loading branch information
vibansal committed Mar 20, 2019
1 parent 9ba82da commit e3c291e
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 12 deletions.
2 changes: 1 addition & 1 deletion hairs-src/estimate_hmm_params.c
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ void print_error_params(int* emission_counts,int* trans_counts,int* indel_length
fprintf(stderr,"\n");
}
//for (pr=0;pr<16;pr++) fprintf(stdout,"%c:%c %d \n",ITB[pr/4],ITB[pr%4],emission_counts[pr]);
fprintf(stderr,"emission for match state \nreads transition counts \n");
//fprintf(stderr,"emission for match state \nreads transition counts \n");
//for (pr=0;pr<9;pr++) fprintf(stdout,"%c:%c %d \n",state[pr/3],state[pr%3],trans_counts[pr]);
//for (pr=0;pr<10;pr++) fprintf(stderr,"%d del: %d ins: %d \n",pr,indel_lengths[pr],indel_lengths[pr+20]);

Expand Down
18 changes: 13 additions & 5 deletions hairs-src/extracthairs.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ int PARSEINDELS = 0;
int SINGLEREADS = 0;
int LONG_READS = 0;
int REALIGN_VARIANTS = 0;
int ESTIMATE_PARAMS = 0; // estimate realignment parameters from BAM
int ESTIMATE_PARAMS = 1; // estimate realignment parameters from BAM
//int QVoffset = 33; declared in samread.h
FILE* logfile;
int PFLAG = 1;
Expand Down Expand Up @@ -111,8 +111,8 @@ void print_options() {
fprintf(stderr, "--ref <FILENAME> : reference sequence file (in fasta format, gzipped is okay), optional but required for indels, should be indexed\n");
fprintf(stderr, "--out <FILENAME> : output filename for haplotype fragments, if not provided, fragments will be output to stdout\n");
fprintf(stderr, "--region <chr:start-end> : chromosome and region in BAM file, useful to process individual chromosomes or genomic regions \n\n");
fprintf(stderr, "--sumall <0/1> : set to 1 to use sum of all local alignments approach (only with long reads), default = 0 \n\n");
fprintf(stderr, "--ep <0/1> : set to 1 to estimate HMM parameters from aligned reads (only with long reads), default = 0 \n\n");
//fprintf(stderr, "--sumall <0/1> : set to 1 to use sum of all local alignments approach (only with long reads), default = 0 \n\n");
fprintf(stderr, "--ep <0/1> : set to 1 to estimate HMM parameters from aligned reads (only with long reads), default = 1 \n\n");
}

void check_input_0_or_1(char* x){
Expand Down Expand Up @@ -306,6 +306,12 @@ int main(int argc, char** argv) {
else if (strcmp(argv[i], "--VCF") == 0 || strcmp(argv[i], "--vcf") == 0) {
strcpy(variantfile, argv[i + 1]);
VCFformat = 1;
int l=strlen(variantfile);
if (variantfile[l-1] == 'z' && variantfile[l-2] == 'g' && variantfile[l-3] == '.')
{
fprintf(stderr,"The input VCF file appears to be gzipped (.gz extension), hapcut only accepts unzipped VCF files as input\n Please provide an unzipped VCF file or make sure that the file doesn't have the .gz extension\n\n");
exit(0);
}
} else if (strcmp(argv[i], "--sorted") == 0){
check_input_0_or_1(argv[i + 1]);
readsorted = atoi(argv[i + 1]);
Expand Down Expand Up @@ -337,7 +343,8 @@ int main(int argc, char** argv) {
}else if (strcmp(argv[i], "--pacbio") == 0 || strcmp(argv[i], "--SMRT") == 0 || strcmp(argv[i],"--pb") ==0){
check_input_0_or_1(argv[i + 1]);
if (atoi(argv[i + 1])){
REALIGN_VARIANTS = 1; PACBIO =1; MINQ = 10;
REALIGN_VARIANTS = 1; PACBIO =1; MINQ = 7;
SUM_ALL_ALIGN = 1;
}

// scores based on https://www.researchgate.net/figure/230618348_fig1_Characterization-of-Pacific-Biosciences-dataa-Base-error-mode-rate-for-deletions
Expand All @@ -353,7 +360,7 @@ int main(int argc, char** argv) {
}else if (strcmp(argv[i], "--ont") == 0 || strcmp(argv[i], "--ONT") == 0){
check_input_0_or_1(argv[i + 1]);
if (atoi(argv[i + 1])){
REALIGN_VARIANTS = 1; MINQ = 10;
REALIGN_VARIANTS = 1; MINQ = 7;
}
// scores based on http://www.nature.com/nmeth/journal/v12/n4/abs/nmeth.3290.html
MATCH = log10(1.0 - (0.051 + 0.049 + 0.078));
Expand All @@ -362,6 +369,7 @@ int main(int argc, char** argv) {
INSERTION_EXTEND = log10(0.25); // this number has no basis in anything
DELETION_OPEN = log10(0.078);
DELETION_EXTEND = log10(0.25); // this number also has no basis in anything
SUM_ALL_ALIGN = 1;

}else if (strcmp(argv[i], "--verbose") == 0 || strcmp(argv[i], "--v") == 0){
check_input_0_or_1(argv[i + 1]);
Expand Down
27 changes: 21 additions & 6 deletions hairs-src/realign_pairHMM.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,14 @@ double sum_all_alignments_logspace(char* v, char* w,Align_Params* params, int mi
}
middle_prev[0] = 0.0;

if (ALLOW_END_GAPS ==1)
//if (ALLOW_END_GAPS ==1)
{
upper_prev[1] = params->lTRS[0][2];
for (j=2;j<lw+1;j++) upper_prev[j] = upper_prev[j-1] + params->lTRS[2][2];
for (j=2;j<lw+1;j++)
{
upper_prev[j] = upper_prev[j-1] + params->lTRS[2][2];
middle_prev[j] = log0;
}
}

for (i=1;i<lv;i++)
Expand All @@ -64,7 +68,7 @@ double sum_all_alignments_logspace(char* v, char* w,Align_Params* params, int mi
int band_end = lw;
if (band_middle + band_width/2 <= lw) band_end = band_middle + band_width/2;

if (ALLOW_END_GAPS ==1)
//if (ALLOW_END_GAPS ==1)
{
if (band_start ==1)
{
Expand Down Expand Up @@ -138,10 +142,14 @@ double sum_all_alignments_fast(char* v, char* w,Align_Params* params, int min_ba
}
middle_prev[0] = 1.0;

if (ALLOW_END_GAPS ==1)
//if (ALLOW_END_GAPS ==1)
{
upper_prev[1] = params->TRS[0][2];
for (j=2;j<lw+1;j++) upper_prev[j] = upper_prev[j-1] * params->TRS[2][2];
for (j=2;j<lw+1;j++)
{
upper_prev[j] = upper_prev[j-1] * params->TRS[2][2];
middle_prev[j] =log0;
}
}

for (i=1;i<lv;i++) // main loop
Expand All @@ -152,7 +160,7 @@ double sum_all_alignments_fast(char* v, char* w,Align_Params* params, int min_ba
int band_end = lw;
if (band_middle + band_width/2 <= lw) band_end = band_middle + band_width/2;

if (ALLOW_END_GAPS ==1)
//if (ALLOW_END_GAPS ==1)
{
if (band_start ==1)
{
Expand Down Expand Up @@ -191,6 +199,13 @@ double sum_all_alignments_fast(char* v, char* w,Align_Params* params, int min_ba
{
upper_prev[j] = upper_curr[j]; middle_prev[j] = middle_curr[j]; lower_prev[j] = lower_curr[j];
}
if (band_start >=2)
{
upper_prev[band_start-2] = -1000000;
middle_prev[band_start-2] = -1000000;
lower_prev[band_start-2] = -1000000; // NaN
}

upper_curr[band_start]= log0; middle_curr[band_start] = log0; lower_curr[band_start] = log0;
}
double ll;
Expand Down

0 comments on commit e3c291e

Please sign in to comment.