Merge branch 'master' into realignment.

Conflicts: hairs-src/extracthairs.c utilities/README.md utilities/calculate_haplotype_statistics.py
shilpagarg · Mar 20, 2019 · 5a6f5df · 5a6f5df
2 parents e3c291e + 1ba3e87
commit 5a6f5df
Show file tree

Hide file tree

Showing 10 changed files with 512 additions and 292 deletions.
diff --git a/README.md b/README.md
@@ -45,7 +45,9 @@ sudo make uninstall-hapcut2
 ## Input:
 HapCUT2 requires the following input:
 - BAM file for an individual containing reads aligned to a reference genome
-- VCF file containing **diploid** SNVs for the individual with respect to the reference
+- VCF file containing **diploid** SNVs (and short indel calls) for the individual with respect to the reference genome
+
+**Note: the program does not accept gzipped VCF files**
 
 ## To Run:
 
@@ -103,7 +105,9 @@ Important note: flag "--split_blocks 1" (compute switch error confidence and aut
 Field 11 is useful for controlling mismatch (single SNV) haplotype errors, similarly to field 9. The default behavior of HapCUT2 is to prune individual SNVs for which this confidence is less than 6.98 (probability of error 0.2), as these are highly likely to be errors.
 
 ## 10X Genomics Linked-Reads
-10X Genomics Linked Reads require an extra step to link short reads together into barcoded molecules:
+10X Genomics Linked Reads require an extra step to link short reads together into barcoded molecules.
+
+NOTE: It is required that the BAM reads have the BX (corrected barcode) tag.
 
 (1) use extractHAIRS to convert BAM file to the compact fragment file format containing only haplotype-relevant information. This is a necessary precursor step to running HapCUT2.
 ```
@@ -118,6 +122,8 @@ python3 utilities/LinkFragments.py --bam reads.sorted.bam --VCF variants.VCF --f
 ./build/HAPCUT2 --nf 1 --fragments linked_fragment_file --VCF variantcalls.vcf --output haplotype_output_file
 ```
 
+It is assumed that reads with the same barcode occurring within 20 kb of another belong to the same molecule, and reads separated by more than this distance are assigned to separate molecules. This distance can be controlled using the ```-d``` parameter in LinkFragments.
+
 ## Hi-C (Proximity Ligation) Sequencing Reads
 
 For improved haplotype accuracy with Hi-C reads, use the --HiC 1 option for both extractHAIRS and HapCUT2 steps.

diff --git a/hairs-src/extracthairs.c b/hairs-src/extracthairs.c
@@ -1,4 +1,3 @@
-// author: Vikas Bansal
 //  program to extract haplotype informative reads from sorted BAM file, input requirements: bamfile and variantfile
 //#include "extracthairs.h"
 #include<stdio.h>
@@ -180,7 +179,7 @@ int parse_bamfile_sorted(char* bamfile, HASHTABLE* ht, CHROMVARS* chromvars, VAR
         }
         // find the chromosome in reflist that matches read->chrom if the previous chromosome is different from current chromosome
         if (read->tid != prevtid) {
-            chrom = getindex(ht, read->chrom); // this will return -1 if the contig name is not  in the VCF file 
+            chrom = getindex(ht, read->chrom); // this will return -1 if the contig name is not  in the VCF file
 	    if (chrom < 0) fprintf(stderr,"chrom \"%s\" not in VCF file, skipping all reads for this chrom.... \n",read->chrom);
 	    else fprintf(stderr,"processing reads mapped to chrom \"%s\" \n",read->chrom);
 		// doing this for every read, can replace this by string comparison ..april 4 2012
@@ -198,6 +197,7 @@ int parse_bamfile_sorted(char* bamfile, HASHTABLE* ht, CHROMVARS* chromvars, VAR
                 }
             }
         } else chrom = prevchrom;
+        //if (chrom_missing_index ==1) { prevtid = read->tid; free_readmemory(read); continue; }
 
         fragment.absIS = (read->IS < 0) ? -1 * read->IS : read->IS;
         // add check to see if the mate and its read are on same chromosome, bug for contigs, july 16 2012
@@ -256,8 +256,7 @@ int parse_bamfile_sorted(char* bamfile, HASHTABLE* ht, CHROMVARS* chromvars, VAR
         prevtid = read->tid;
         free_readmemory(read);
     } // end of main while loop
-
-    if (fragments > 0)  // still fragments left in buffer 
+    if (fragments > 0)  // still fragments left in buffer
     {
         fprintf(stderr, "final cleanup of fragment list: %d current chrom %s %d prev %d \n", fragments, read->chrom, read->position,prevchrom);
         if (prevchrom >=0) clean_fragmentlist(flist, &fragments, varlist, -1, read->position, prevchrom); // added extra filter 03/08/18
@@ -425,6 +424,10 @@ int main(int argc, char** argv) {
         fprintf(stderr, "\nERROR: In order to realign variants (including --pacbio and --ont options), reference fasta file must be provided with --ref option.\n");
         exit(1);
     }
+    if (MINQ < 4) {
+      fprintf(stderr, "\nERROR: MINQ must be at least 4.\n");
+      exit(1);
+    }
     if (bamfiles > 0 && strcmp(variantfile, "None") != 0) {
         bamfilelist = (char**) malloc(sizeof (char*)*bamfiles);
         for (i = 0; i < bamfiles; i++) bamfilelist[i] = (char*) malloc(1024);

diff --git a/hapcut2-src/readinputfiles.c b/hapcut2-src/readinputfiles.c
@@ -2,7 +2,7 @@
 #include "common.h"
 #include <assert.h>
 
-extern int HOMOZYGOUS_PRIOR;
+extern float HOMOZYGOUS_PRIOR;
 extern int NEW_FRAGFILE_FORMAT;
 
 int fragment_compare(const void *a, const void *b) {
@@ -217,9 +217,9 @@ int count_variants_vcf(char* vcffile) {
 int read_vcffile(char* vcffile, struct SNPfrags* snpfrag, int snps) {
     char buffer[500000];
     char temp[1000];
-    char GQ[5];
+    char GQ[100];
     char* gen;
-    int i = 0, j = 0, k=0, len=0, s = 0, e = 0, var = 0, GQ_ix, format_ix;
+    int i = 0, j = 0, k=0, s = 0, e = 0, var = 0, GQ_ix, format_ix;
     FILE* fp = fopen(vcffile, "r");
     while (fgets(buffer, 500000, fp)) {
         if (buffer[0] == '#') continue;
@@ -307,29 +307,47 @@ int read_vcffile(char* vcffile, struct SNPfrags* snpfrag, int snps) {
         snpfrag[var].genotypes = (char*) malloc(e - s + 1);
         for (j = s; j < e; j++) snpfrag[var].genotypes[j - s] = buffer[j];
         snpfrag[var].genotypes[j - s] = '\0';
-        len = j-s;
+        //len = j-s;
         gen = snpfrag[var].genotypes; //  for convenience
+        snpfrag[var].homozygous_prior = HOMOZYGOUS_PRIOR; // default value
+        int no_gq = 0;
         if (GQ_ix != -1) {
+
             // get to the index where GQ is located.
             k = 0; // where we are in gen
             for (j=0; j<GQ_ix; j++){
-                while(gen[k] != ':')
+                if (no_gq){
+                  break;
+                }
+
+                while(gen[k] != ':') {
                     k++;
+                    if (gen[k] == '\n' || gen[k] == '\0') {
+                        no_gq = 1;
+                        break;
+                    }
+                }
                 k++; // step past the ':'
+
+                if (gen[k] == '\n' || gen[k] == '\0') {
+                    no_gq = 1;
+                    break;
+                }
             }
+
             // reached GQ field. read it in.
+            if (!no_gq) {
+                j=0;
 
-            j=0;
-            while (k<len && gen[k] != ':') {
-                GQ[j] = gen[k];
-                k++;
-                j++;
-            }
-            GQ[j] = '\0';
+                while (gen[k] != '\0' && gen[k] != ':') {
+                    GQ[j] = gen[k];
+                    k++;
+                    j++;
+                }
+                GQ[j] = '\0';
 
-            snpfrag[var].homozygous_prior = atof(GQ) / -10; // log prior probability of homozygousity
-        } else{
-            snpfrag[var].homozygous_prior = HOMOZYGOUS_PRIOR;
+                snpfrag[var].homozygous_prior = atof(GQ) / -10.0; // log prior probability of homozygousity
+            }
         }
 
         var++;

diff --git a/recipes/HiC_Longread/Snakefile b/recipes/HiC_Longread/Snakefile
@@ -59,7 +59,7 @@ rule longread_extract_hairs:
     output: expand('data/longread/{chrom}',chrom=chroms)
     run:
         for chrom,bam,frag in zip(chroms,input,output):
-            shell('{EXTRACTHAIRS} --new_format 1 --bam {bam} --VCF {VCF_DIR}/{chrom}.vcf > {frag}')
+            shell('{EXTRACTHAIRS} --pacbio 1 --new_format 1 --bam {bam} --VCF {VCF_DIR}/{chrom}.vcf > {frag}')
 
 # convert Hi-C bam files to haplotype fragment files
 rule hic_extract_hairs:

diff --git a/reproduce_hapcut2_paper/run_hapcut2_fosmid/README.md b/reproduce_hapcut2_paper/run_hapcut2_fosmid/README.md
@@ -0,0 +1,46 @@
+Run HapCUT2 on Duitama et al fosmid data
+======
+
+This directory contains a bash script for running HapCUT2 on the fosmid data from
+the 2012 Duitama et al study [1]:
+
+Steps in the script:
+1. download 1000 Genomes VCFs (phase 1, hg18) for NA12878 trio [2]
+2. filter the VCF for only NA12878 heterozygous sites (and separate by chromosome)
+3. check that variant indices and genomic coordinates match between filtered VCF and Duitama phased haplotype data
+4. download fosmid fragment files and remove first line (matrix dimensions) which is incompatible with HapCUT2
+5. run HapCUT2 on each fragment file to produce HapCUT2 haplotype files
+6. use calculate_haplotype_statistics.py script to calculate the haplotype accuracy using the 1000G trio-phased VCF as ground truth
+
+More detailed explanation of steps 1-3:
+HapCUT2 requires a VCF file along with the fragment data, such that the variant indices
+in the fragment file match those in the VCF. The original filtered VCF from 1000 Genomes
+used to generate the fosmid fragments is not provided, so it is necessary to produce one. There is also a
+need for ground-truth haplotypes to compare against, and the phased 1000g variants
+can also be used for this purpose. So, the bash script
+downloads VCF files for NA12878 from 1000 Genomes project phase 1, and filters
+them manually (for NA12878 heterozygous sites only) and then checks
+that the genomic coordinates and variant indices match against refhap-based phased
+haplotype data files provided by Duitama et al.
+
+## Steps to run
+1. on a linux machine (tested using ubuntu) clone repository and build HapCUT2 using the makefile and instructions in main github README
+2. change to this directory (HapCUT2/reproduce_hapcut2_paper/run_hapcut2_fosmid)
+3. run run_hapcut2_fosmid_data.sh using bash:
+
+```
+bash run_hapcut2_fosmid_data.sh
+```
+or
+```
+chmod +x run_hapcut2_fosmid_data.sh
+./run_hapcut2_fosmid_data.sh
+```
+
+The HapCUT2 haplotypes will be in ```data/hapcut2_haplotypes``` and the error rates compared to 1000G trio haplotypes will be printed to console.
+
+References:
+
+[1] Duitama, J., McEwen, G.K., Huebsch, T., Palczewski, S., Schulz, S., Verstrepen, K., Suk, E.K. and Hoehe, M.R., 2011. Fosmid-based whole genome haplotyping of a HapMap trio child: evaluation of Single Individual Haplotyping techniques. Nucleic acids research, 40(5), pp.2041-2053.
+
+[2] 1000 Genomes Project Consortium, 2010. A map of human genome variation from population-scale sequencing. Nature, 467(7319), p.1061.
diff --git a/reproduce_hapcut2_paper/run_hapcut2_fosmid/create_NA12878_hg18_vcfs.py b/reproduce_hapcut2_paper/run_hapcut2_fosmid/create_NA12878_hg18_vcfs.py
@@ -0,0 +1,35 @@
+
+header = ''
+prev_chrom = None
+chroms = ['chr{}'.format(x) for x in range(1,23)] + ['chrX']
+output_filenames = ['data/NA12878_hg18_VCFs/{}.vcf'.format(c) for c in chroms]
+output_files = {chrom : open(f,'w') for (chrom,f) in zip(chroms,output_filenames)}
+with open("data/temp/NA12878_trio_genotypes_original.vcf",'r') as infile:
+    for line in infile:
+        # header lines
+        if line[0] == '#':
+            if line[:6] == '#CHROM': # need to remove parent sample labels
+                el = line.strip().split('\t')
+                line = '\t'.join(el[:9] + [el[11]])
+            header += line # add line to header so we can print it to separate chrom files
+            continue
+
+        el = line.strip().split('\t')
+        assert(len(el) == 12) # VCF line with 3 individuals has 12 elements
+        el = el[:9] + [el[11]] # remove parents
+
+        el[0] = 'chr' + el[0]
+        chrom = el[0] # add chr label
+        if chrom != prev_chrom: # we're on to a new chromosome
+            print(header, file=output_files[chrom]) # print header
+
+        new_line = '\t'.join(el)
+        # heterozygous variants for NA12878 only
+        if el[9][:3] in ['0/1','1/0','0|1','1|0']:
+            print(new_line, file=output_files[chrom])
+
+        prev_chrom = chrom
+
+# close new VCFs
+for f in output_files.values():
+    f.close()
diff --git a/reproduce_hapcut2_paper/run_hapcut2_fosmid/run_hapcut2_fosmid_data.sh b/reproduce_hapcut2_paper/run_hapcut2_fosmid/run_hapcut2_fosmid_data.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+HAPCUT2=../../build/HAPCUT2
+HAP_STATISTICS=../../utilities/calculate_haplotype_statistics.py
+
+
+mkdir -p data/temp
+# download the thousand genomes VCFs
+echo "DOWNLOADING 1000 GENOMES VCFS FOR NA12878 TRIO"
+wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/pilot_data/release/2010_07/trio/snps/CEU.trio.2010_03.genotypes.vcf.gz \
+      -O data/temp/NA12878_trio_genotypes_original.vcf.gz
+# unzip the thousand genomes VCFs
+gunzip -c data/temp/NA12878_trio_genotypes_original.vcf.gz > data/temp/NA12878_trio_genotypes_original.vcf
+
+echo "FILTERING 1000 GENOMES VCFS FOR JUST NA12878 HETEROZYGOUS SITES"
+# process the thousand genomes VCFs to be just NA12878 heterozygous sites
+mkdir data/NA12878_hg18_VCFs
+python3 create_NA12878_hg18_vcfs.py
+
+echo "CHECKING THAT NA12878 VARIANT INDICES/COORDINATES MATCH DUITAMA PHASED DATA"
+# download the Duitama et al 'phased matrix'. It has genomic coordinates for the variants
+# we basically want to make sure that our processed NA12878 VCF
+# has matching variant indices and genomic coordinates
+# to Duitama's own Refhap-based phased data.
+wget http://www.molgen.mpg.de/~genetic-variation/SIH/Data/haplotypes.tar.gz \
+     -O data/temp/duitama_haplotypes.tar.gz
+mkdir data/temp/duitama_haplotypes
+tar -xzf data/temp/duitama_haplotypes.tar.gz -C data/temp/duitama_haplotypes
+python3 sanity_check_variant_indices.py
+
+echo "DOWNLOADING AND PROCESSING DUITAMA ET AL FRAGMENT FILES"
+# download and unzip the Duitama et al phasing matrices (fragment files)
+wget http://www.molgen.mpg.de/~genetic-variation/SIH/Data/phasing_matrices.tar.gz \
+     -O data/temp/phasing_matrices.tar.gz
+mkdir data/NA12878_fosmid_data_original
+mkdir data/NA12878_fosmid_data_formatted
+tar -xzf data/temp/phasing_matrices.tar.gz -C data/NA12878_fosmid_data_original
+
+# remove first line of fragment files
+# Duitama et al phasing matrices have the first line as matrix dimensions
+# this does not work with HapCUT2
+for i in {1..22} X
+    do
+    tail -n +2 data/NA12878_fosmid_data_original/chr${i}.matrix.SORTED \
+    > data/NA12878_fosmid_data_formatted/chr${i}.matrix.SORTED
+done
+
+echo "RUNNING HAPCUT2 ON EACH CHROMOSOME OF DUITAMA ET AL DATA"
+mkdir data/hapcut2_haplotypes
+# run HapCUT2 on each chromosome
+for i in {1..22} X; do
+    $HAPCUT2 --fragments data/NA12878_fosmid_data_formatted/chr${i}.matrix.SORTED \
+        --vcf data/NA12878_hg18_VCFs/chr${i}.vcf \
+        --out data/hapcut2_haplotypes/chr${i}.hap
+done
+
+echo "COMPARING ASSEMBLED HAPLOTYPE TO 1000G PHASE DATA FOR ACCURACY"
+python3 $HAP_STATISTICS -v1 data/NA12878_hg18_VCFs/chr{1..22}.vcf \
+                            data/NA12878_hg18_VCFs/chrX.vcf \
+                        -h1 data/hapcut2_haplotypes/chr{1..22}.hap \
+                            data/hapcut2_haplotypes/chrX.hap \
+                        -v2 data/NA12878_hg18_VCFs/chr{1..22}.vcf \
+                            data/NA12878_hg18_VCFs/chrX.vcf
diff --git a/reproduce_hapcut2_paper/run_hapcut2_fosmid/sanity_check_variant_indices.py b/reproduce_hapcut2_paper/run_hapcut2_fosmid/sanity_check_variant_indices.py
@@ -0,0 +1,27 @@
+
+chroms = ['chr{}'.format(x) for x in range(1,23)] + ['chrX']
+
+for c in chroms:
+    duitama_chrom_pos = []
+    with open("data/temp/duitama_haplotypes/{}.real_refhap.phase".format(c),'r') as duitama_file:
+        for line in duitama_file:
+            el = line.strip().split()
+            pos = int(el[0])
+            duitama_chrom_pos.append((c, pos))
+
+    NA12878_chrom_pos = []
+    with open("data/NA12878_hg18_VCFs/{}.vcf".format(c),'r') as NA12878_vcf:
+        for line in NA12878_vcf:
+            # header lines
+            if line[0] == '#':
+                continue
+
+            el = line.strip().split('\t')
+            chrom = el[0] # add chr label
+            pos = int(el[1])
+
+            NA12878_chrom_pos.append((chrom,pos))
+
+    print("checking that indices in Duitama haplotype with {} elements matches indices in NA12878 haplotype with {} elements...".format(len(duitama_chrom_pos), len(NA12878_chrom_pos)))
+    assert(duitama_chrom_pos == NA12878_chrom_pos)
+    print("...PASSED")