Skip to content

Commit

Permalink
updating realignment code
Browse files Browse the repository at this point in the history
  • Loading branch information
vibansal committed Dec 5, 2018
1 parent 9ec56fc commit d67822e
Show file tree
Hide file tree
Showing 4 changed files with 165 additions and 160 deletions.
97 changes: 68 additions & 29 deletions hairs-src/estimate_hmm_params.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,42 @@ int BTI[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

typedef struct // probabilities are in logspace
{
int states; // match, insertion,deletion
double** TRS; // transition probabilities between states
double** MEM; // probability of A->C, A->G (match state)
double match; double mismatch; double insertion; double deletion; // emission probs
} Align_Params;

extern Align_Params *AP;

// allocate memory and initializes
Align_Params* init_params()
{
int i=0,j=0;
Align_Params *AP = malloc(sizeof(Align_Params));
(*AP).states = 3;
(*AP).TRS = calloc((*AP).states,sizeof(double*)); // match =0, ins =1, del = 2
for (i=0;i<AP->states;i++) AP->TRS[i] = calloc(sizeof(double),AP->states);
// initialize alignment parameters data structure
AP->match = log(0.979); AP->mismatch = log(0.007);
AP->deletion = log(1); AP->insertion =log(1);
AP->TRS[0][0] = log(0.879); AP->TRS[0][1] = log(0.076); AP->TRS[0][2] = log(0.045);
AP->TRS[1][0] = log(0.865); AP->TRS[1][1] = log(0.135);
AP->TRS[2][0] = log(0.730); AP->TRS[2][2] = log(0.27);
AP->MEM = calloc(4,sizeof(double*)); for (i=0;i<4;i++) AP->MEM[i] = calloc(4,sizeof(double));
for (i=0;i<4;i++)
{
for (j=0;j<4;j++)
{
if (i==j) AP->MEM[i][j] = AP->match; else AP->MEM[i][j] = AP->mismatch;
}
}
return AP;
}

// for every HP of length 'k', tabulate # of times read-correctly and indel error

void estimate_counts_read(struct alignedread* read,REFLIST* reflist,int* emission_counts,int* trans_counts,int* indel_lengths)
{
Expand Down Expand Up @@ -52,12 +87,25 @@ void estimate_counts_read(struct alignedread* read,REFLIST* reflist,int* emissio
}
if (op == BAM_CDEL)
{
/*
for (t=-5;t<0;t++) fprintf(stderr,"%c",reflist->sequences[current][read->position+l2+t-1]);
fprintf(stderr,"|"); for (t=0;t<l;t++) fprintf(stderr,"%c",reflist->sequences[current][read->position+l2+t-1]);
fprintf(stderr,"|"); for (t=l;t<l+10;t++) fprintf(stderr,"%c",reflist->sequences[current][read->position+l2+t-1]);
fprintf(stderr," DEL %d %d\n",l,l2+read->position-1);
*/

if (prevop == BAM_CMATCH || prevop == BAM_CEQUAL || prevop == BAM_CDIFF) trans_counts[0*3 + 2] +=1;
if (l < 10) trans_counts[2*3+2] +=l-1;
if (l < 20) indel_lengths[l] +=1;
}
if (op == BAM_CINS)
{
/*
for (t=-5;t<0;t++) fprintf(stderr,"%c",reflist->sequences[current][read->position+l2+t-1]);
fprintf(stderr,"|"); for (t=0;t<l;t++) fprintf(stderr,"%c",read->sequence[l1+t]);
fprintf(stderr,"|"); for (t=l;t<l+10;t++) fprintf(stderr,"%c",reflist->sequences[current][read->position+l2+t-1]);
fprintf(stderr," INS %d %d\n",l,read->position+l2-1);
*/
if (prevop == BAM_CMATCH || prevop == BAM_CEQUAL || prevop == BAM_CDIFF) trans_counts[0*3 + 1] +=1;
if (l < 10) trans_counts[1*3+1] +=l-1;
if (l < 20) indel_lengths[l+20] +=1;
Expand All @@ -69,7 +117,7 @@ void estimate_counts_read(struct alignedread* read,REFLIST* reflist,int* emissio
}
}

void print_error_params(int* emission_counts,int* trans_counts,int* indel_lengths)
void print_error_params(int* emission_counts,int* trans_counts,int* indel_lengths,Align_Params* AP)
{
char ITB[] = {'A','C','G','T'};
char state[] = {'M','I','D'};
Expand All @@ -78,26 +126,34 @@ void print_error_params(int* emission_counts,int* trans_counts,int* indel_length
{
int total =0.01;
for (b2=0;b2<4;b2++) total += emission_counts[b1*4+b2];
for (b2=0;b2<4;b2++) fprintf(stdout,"%c -> %c %0.4f | ",ITB[b1],ITB[b2],(float)emission_counts[b1*4+b2]/total);
fprintf(stdout,"\n");
for (b2=0;b2<4;b2++)
{
fprintf(stderr,"%c -> %c %0.4f | ",ITB[b1],ITB[b2],(float)emission_counts[b1*4+b2]/total);
AP->MEM[b1][b2] = log((float)emission_counts[b1*4+b2]/total);
}
fprintf(stderr,"\n");
}
for (b1=0;b1<3;b1++)
{
int total =0.01;
for (b2=0;b2<3;b2++) total += trans_counts[b1*3+b2];
for (b2=0;b2<3;b2++) fprintf(stdout,"%c -> %c %0.4f | ",state[b1],state[b2],(float)trans_counts[b1*3+b2]/total);
fprintf(stdout,"\n");
for (b2=0;b2<3;b2++)
{
fprintf(stderr,"%c -> %c %0.4f | ",state[b1],state[b2],(float)trans_counts[b1*3+b2]/total);
AP->TRS[b1][b2] = log((float)trans_counts[b1*3+b2]/total);
}
fprintf(stderr,"\n");
}
//for (pr=0;pr<16;pr++) fprintf(stdout,"%c:%c %d \n",ITB[pr/4],ITB[pr%4],emission_counts[pr]);
fprintf(stdout,"emission for match state \nreads transition counts \n");
fprintf(stderr,"emission for match state \nreads transition counts \n");
//for (pr=0;pr<9;pr++) fprintf(stdout,"%c:%c %d \n",state[pr/3],state[pr%3],trans_counts[pr]);
for (pr=0;pr<10;pr++) fprintf(stdout,"%d del: %d ins: %d \n",pr,indel_lengths[pr],indel_lengths[pr+20]);
for (pr=0;pr<10;pr++) fprintf(stderr,"%d del: %d ins: %d \n",pr,indel_lengths[pr],indel_lengths[pr+20]);

}


// reflist only has sequences for contigs in VCF, so this can cause segfault
int realignment_params(char* bamfile,REFLIST* reflist,char* regions)
int realignment_params(char* bamfile,REFLIST* reflist,char* regions,Align_Params* AP)
{
struct alignedread* read = (struct alignedread*) malloc(sizeof (struct alignedread));
int i=0;
Expand Down Expand Up @@ -128,9 +184,9 @@ int realignment_params(char* bamfile,REFLIST* reflist,char* regions)
if (newregion[i] == ':') break;
}
newregion[i+1] = '\0';
fprintf(stderr,"newrion %s %s \n",regions,newregion);
//fprintf(stderr,"newrion %s %s \n",regions,newregion);
if ((idx = bam_index_load(bamfile)) ==0) { fprintf(stderr,"unable to load bam index for file %s\n",bamfile); return -1; }
bam_parse_region(fp->header,regions,&ref,&beg,&end);
bam_parse_region(fp->header,newregion,&ref,&beg,&end);
if (ref < 0) { fprintf(stderr,"invalid region for bam file %s \n",regions); return -1; }
iter = bam_iter_query(idx,ref,beg,end);
}
Expand Down Expand Up @@ -172,25 +228,8 @@ int realignment_params(char* bamfile,REFLIST* reflist,char* regions)
prevtid = read->tid;
}
fprintf(stderr,"using %d reads to estimate realignment parameters for HMM \n",reads);
if (reads > 1000) print_error_params(emission_counts,trans_counts,indel_lengths);
if (reads > 1000) print_error_params(emission_counts,trans_counts,indel_lengths,AP);

bam_destroy1(b); samclose(fp); free(newregion);
}


// call function to estimate alignment parameters using a small number of reads, after that reset bam reading to go back and start again
/*
if (REALIGN_VARIANTS ==1 && readsused < 1000)
{
estimate_params(read,reflist,emission_counts,trans_counts,indel_lengths); readsused++;
if (readsused == 1000)
{
print_error_params(emission_counts,trans_counts,indel_lengths);
b = bam_init1();
if (regions != NULL) iter = bam_iter_query(idx,ref,beg,end);
}
prevchrom = chrom; prevtid = read->tid; free_readmemory(read);
continue;
}
*/

}
48 changes: 9 additions & 39 deletions hairs-src/extracthairs.c
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
// author: Vikas Bansal, vbansal@scripps, 2011-2012
// author: Vikas Bansal
// program to extract haplotype informative reads from sorted BAM file, input requirements: bamfile and variantfile
// Jan 13 2012, changed to read directly from BAM file
// paired-end overlapping reads need to be handled properly
// add module for RNA-seq data as well
// add flag for secondary alignments and PCR duplicates (picard mark duplicates)
// input format for variants should be VCF from now
//#include "extracthairs.h"
#include<stdio.h>
#include<stdlib.h>
Expand Down Expand Up @@ -77,13 +72,13 @@ int PRINT_COMPACT = 1; // 1= print each fragment block by block, 0 = print varia

//int get_chrom_name(struct alignedread* read,HASHTABLE* ht,REFLIST* reflist);

#include "estimate_hmm_params.c"
#include "realign_pairHMM.c" // added 11/29/2018
#include "parsebamread.c"
#include "realignbamread.c"
#include "fosmidbam_hairs.c" // code for parsing fosmid pooled sequence data
#include "estimate_hmm_params.c"

Align_Params AP; // global alignmnet params
Align_Params* AP; // global alignmnet params

//disabled sam file reading
//#include "samhairs.c" // has two functions that handle sam file parsing
Expand Down Expand Up @@ -133,11 +128,9 @@ int parse_bamfile_sorted(char* bamfile, HASHTABLE* ht, CHROMVARS* chromvars, VAR
int reads = 0;
struct alignedread* read = (struct alignedread*) malloc(sizeof (struct alignedread));

if (REALIGN_VARIANTS) realignment_params(bamfile,reflist,regions); // estimate alignment parameters from BAM file ONT/pacbio reads only 12/3/18
if (REALIGN_VARIANTS) realignment_params(bamfile,reflist,regions,AP); // estimate alignment parameters from BAM file ONT/pacbio reads only 12/3/18

if (REALIGN_VARIANTS){
fcigarlist = calloc(sizeof(int),400000);
}
if (REALIGN_VARIANTS) fcigarlist = calloc(sizeof(int),400000);
int i = 0;
int chrom = 0; //int sl=0;
// int v1,v2;
Expand Down Expand Up @@ -271,13 +264,12 @@ int parse_bamfile_sorted(char* bamfile, HASHTABLE* ht, CHROMVARS* chromvars, VAR
bam_destroy1(b);

free(flist); free(read); free(fragment.alist);
if (REALIGN_VARIANTS){
free(fcigarlist);
}
if (REALIGN_VARIANTS) free(fcigarlist);
return 0;
}

int main(int argc, char** argv) {

char samfile[1024];
char bamfile[1024];
char variantfile[1024];
Expand All @@ -292,7 +284,7 @@ int main(int argc, char** argv) {
sampleid[0] = '-';
sampleid[1] = '\0';
int samplecol = 10; // default if there is a single sample in the VCF file
int i = 0, variants = 0, hetvariants = 0;
int i = 0, variants = 0, hetvariants = 0,j=0;
char** bamfilelist = NULL;
int bamfiles = 0;

Expand All @@ -304,16 +296,7 @@ int main(int argc, char** argv) {
exit(1);
}

AP.states = 3;
AP.TRS = calloc(sizeof(double*),AP.states); // match =0, ins =1, del = 2
for (i=0;i<AP.states;i++) AP.TRS[i] = calloc(sizeof(double),AP.states);
// initialize alignment parameters data structure
AP.match = log(0.979); AP.mismatch = log(0.007); AP.deletion = log(1); AP.insertion =log(1);
AP.TRS[0][0] = log(0.879); AP.TRS[0][1] = log(0.076); AP.TRS[0][2] = log(0.045);
AP.TRS[1][0] = log(0.865); AP.TRS[1][1] = log(0.135);
AP.TRS[2][0] = log(0.730); AP.TRS[2][2] = log(0.27);
/*
*/
AP = init_params();

for (i = 1; i < argc; i += 2) {
if (strcmp(argv[i], "--bam") == 0 || strcmp(argv[i], "--bamfile") == 0) bamfiles++;
Expand Down Expand Up @@ -516,19 +499,6 @@ int main(int argc, char** argv) {
free(chromvars[i].intervalmap);
}
free(chromvars);
/*
for (i=0;i<ht.htsize;i++){
if (ht.blist[i] != NULL){
for(j=0;j<ht.bucketlengths[i];j++){
free(ht.blist[i]->key);
}
free(ht.blist[i]);
}
}
free(ht.blist);
free(ht.bucketlengths);
*/
free(sampleid); free(varlist); free(reflist);
if (bamfiles > 0 && strcmp(variantfile,"None") !=0){
for (i=0;i<bamfiles;i++)
Expand Down
Loading

0 comments on commit d67822e

Please sign in to comment.