Skip to content

Commit

Permalink
Merge branch 'master' into realignment.
Browse files Browse the repository at this point in the history
Conflicts:
	hairs-src/extracthairs.c
	utilities/README.md
	utilities/calculate_haplotype_statistics.py
  • Loading branch information
vibansal committed Mar 20, 2019
2 parents e3c291e + 1ba3e87 commit 5a6f5df
Show file tree
Hide file tree
Showing 10 changed files with 512 additions and 292 deletions.
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ sudo make uninstall-hapcut2
## Input:
HapCUT2 requires the following input:
- BAM file for an individual containing reads aligned to a reference genome
- VCF file containing **diploid** SNVs for the individual with respect to the reference
- VCF file containing **diploid** SNVs (and short indel calls) for the individual with respect to the reference genome

**Note: the program does not accept gzipped VCF files**

## To Run:

Expand Down Expand Up @@ -103,7 +105,9 @@ Important note: flag "--split_blocks 1" (compute switch error confidence and aut
Field 11 is useful for controlling mismatch (single SNV) haplotype errors, similarly to field 9. The default behavior of HapCUT2 is to prune individual SNVs for which this confidence is less than 6.98 (probability of error 0.2), as these are highly likely to be errors.

## 10X Genomics Linked-Reads
10X Genomics Linked Reads require an extra step to link short reads together into barcoded molecules:
10X Genomics Linked Reads require an extra step to link short reads together into barcoded molecules.

NOTE: It is required that the BAM reads have the BX (corrected barcode) tag.

(1) use extractHAIRS to convert BAM file to the compact fragment file format containing only haplotype-relevant information. This is a necessary precursor step to running HapCUT2.
```
Expand All @@ -118,6 +122,8 @@ python3 utilities/LinkFragments.py --bam reads.sorted.bam --VCF variants.VCF --f
./build/HAPCUT2 --nf 1 --fragments linked_fragment_file --VCF variantcalls.vcf --output haplotype_output_file
```

It is assumed that reads with the same barcode occurring within 20 kb of another belong to the same molecule, and reads separated by more than this distance are assigned to separate molecules. This distance can be controlled using the ```-d``` parameter in LinkFragments.

## Hi-C (Proximity Ligation) Sequencing Reads

For improved haplotype accuracy with Hi-C reads, use the --HiC 1 option for both extractHAIRS and HapCUT2 steps.
Expand Down
11 changes: 7 additions & 4 deletions hairs-src/extracthairs.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
// author: Vikas Bansal
// program to extract haplotype informative reads from sorted BAM file, input requirements: bamfile and variantfile
//#include "extracthairs.h"
#include<stdio.h>
Expand Down Expand Up @@ -180,7 +179,7 @@ int parse_bamfile_sorted(char* bamfile, HASHTABLE* ht, CHROMVARS* chromvars, VAR
}
// find the chromosome in reflist that matches read->chrom if the previous chromosome is different from current chromosome
if (read->tid != prevtid) {
chrom = getindex(ht, read->chrom); // this will return -1 if the contig name is not in the VCF file
chrom = getindex(ht, read->chrom); // this will return -1 if the contig name is not in the VCF file
if (chrom < 0) fprintf(stderr,"chrom \"%s\" not in VCF file, skipping all reads for this chrom.... \n",read->chrom);
else fprintf(stderr,"processing reads mapped to chrom \"%s\" \n",read->chrom);
// doing this for every read, can replace this by string comparison ..april 4 2012
Expand All @@ -198,6 +197,7 @@ int parse_bamfile_sorted(char* bamfile, HASHTABLE* ht, CHROMVARS* chromvars, VAR
}
}
} else chrom = prevchrom;
//if (chrom_missing_index ==1) { prevtid = read->tid; free_readmemory(read); continue; }

fragment.absIS = (read->IS < 0) ? -1 * read->IS : read->IS;
// add check to see if the mate and its read are on same chromosome, bug for contigs, july 16 2012
Expand Down Expand Up @@ -256,8 +256,7 @@ int parse_bamfile_sorted(char* bamfile, HASHTABLE* ht, CHROMVARS* chromvars, VAR
prevtid = read->tid;
free_readmemory(read);
} // end of main while loop

if (fragments > 0) // still fragments left in buffer
if (fragments > 0) // still fragments left in buffer
{
fprintf(stderr, "final cleanup of fragment list: %d current chrom %s %d prev %d \n", fragments, read->chrom, read->position,prevchrom);
if (prevchrom >=0) clean_fragmentlist(flist, &fragments, varlist, -1, read->position, prevchrom); // added extra filter 03/08/18
Expand Down Expand Up @@ -425,6 +424,10 @@ int main(int argc, char** argv) {
fprintf(stderr, "\nERROR: In order to realign variants (including --pacbio and --ont options), reference fasta file must be provided with --ref option.\n");
exit(1);
}
if (MINQ < 4) {
fprintf(stderr, "\nERROR: MINQ must be at least 4.\n");
exit(1);
}
if (bamfiles > 0 && strcmp(variantfile, "None") != 0) {
bamfilelist = (char**) malloc(sizeof (char*)*bamfiles);
for (i = 0; i < bamfiles; i++) bamfilelist[i] = (char*) malloc(1024);
Expand Down
48 changes: 33 additions & 15 deletions hapcut2-src/readinputfiles.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#include "common.h"
#include <assert.h>

extern int HOMOZYGOUS_PRIOR;
extern float HOMOZYGOUS_PRIOR;
extern int NEW_FRAGFILE_FORMAT;

int fragment_compare(const void *a, const void *b) {
Expand Down Expand Up @@ -217,9 +217,9 @@ int count_variants_vcf(char* vcffile) {
int read_vcffile(char* vcffile, struct SNPfrags* snpfrag, int snps) {
char buffer[500000];
char temp[1000];
char GQ[5];
char GQ[100];
char* gen;
int i = 0, j = 0, k=0, len=0, s = 0, e = 0, var = 0, GQ_ix, format_ix;
int i = 0, j = 0, k=0, s = 0, e = 0, var = 0, GQ_ix, format_ix;
FILE* fp = fopen(vcffile, "r");
while (fgets(buffer, 500000, fp)) {
if (buffer[0] == '#') continue;
Expand Down Expand Up @@ -307,29 +307,47 @@ int read_vcffile(char* vcffile, struct SNPfrags* snpfrag, int snps) {
snpfrag[var].genotypes = (char*) malloc(e - s + 1);
for (j = s; j < e; j++) snpfrag[var].genotypes[j - s] = buffer[j];
snpfrag[var].genotypes[j - s] = '\0';
len = j-s;
//len = j-s;
gen = snpfrag[var].genotypes; // for convenience
snpfrag[var].homozygous_prior = HOMOZYGOUS_PRIOR; // default value
int no_gq = 0;
if (GQ_ix != -1) {

// get to the index where GQ is located.
k = 0; // where we are in gen
for (j=0; j<GQ_ix; j++){
while(gen[k] != ':')
if (no_gq){
break;
}

while(gen[k] != ':') {
k++;
if (gen[k] == '\n' || gen[k] == '\0') {
no_gq = 1;
break;
}
}
k++; // step past the ':'

if (gen[k] == '\n' || gen[k] == '\0') {
no_gq = 1;
break;
}
}

// reached GQ field. read it in.
if (!no_gq) {
j=0;

j=0;
while (k<len && gen[k] != ':') {
GQ[j] = gen[k];
k++;
j++;
}
GQ[j] = '\0';
while (gen[k] != '\0' && gen[k] != ':') {
GQ[j] = gen[k];
k++;
j++;
}
GQ[j] = '\0';

snpfrag[var].homozygous_prior = atof(GQ) / -10; // log prior probability of homozygousity
} else{
snpfrag[var].homozygous_prior = HOMOZYGOUS_PRIOR;
snpfrag[var].homozygous_prior = atof(GQ) / -10.0; // log prior probability of homozygousity
}
}

var++;
Expand Down
2 changes: 1 addition & 1 deletion recipes/HiC_Longread/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ rule longread_extract_hairs:
output: expand('data/longread/{chrom}',chrom=chroms)
run:
for chrom,bam,frag in zip(chroms,input,output):
shell('{EXTRACTHAIRS} --new_format 1 --bam {bam} --VCF {VCF_DIR}/{chrom}.vcf > {frag}')
shell('{EXTRACTHAIRS} --pacbio 1 --new_format 1 --bam {bam} --VCF {VCF_DIR}/{chrom}.vcf > {frag}')

# convert Hi-C bam files to haplotype fragment files
rule hic_extract_hairs:
Expand Down
46 changes: 46 additions & 0 deletions reproduce_hapcut2_paper/run_hapcut2_fosmid/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
Run HapCUT2 on Duitama et al fosmid data
======

This directory contains a bash script for running HapCUT2 on the fosmid data from
the 2012 Duitama et al study [1]:

Steps in the script:
1. download 1000 Genomes VCFs (phase 1, hg18) for NA12878 trio [2]
2. filter the VCF for only NA12878 heterozygous sites (and separate by chromosome)
3. check that variant indices and genomic coordinates match between filtered VCF and Duitama phased haplotype data
4. download fosmid fragment files and remove first line (matrix dimensions) which is incompatible with HapCUT2
5. run HapCUT2 on each fragment file to produce HapCUT2 haplotype files
6. use calculate_haplotype_statistics.py script to calculate the haplotype accuracy using the 1000G trio-phased VCF as ground truth

More detailed explanation of steps 1-3:
HapCUT2 requires a VCF file along with the fragment data, such that the variant indices
in the fragment file match those in the VCF. The original filtered VCF from 1000 Genomes
used to generate the fosmid fragments is not provided, so it is necessary to produce one. There is also a
need for ground-truth haplotypes to compare against, and the phased 1000g variants
can also be used for this purpose. So, the bash script
downloads VCF files for NA12878 from 1000 Genomes project phase 1, and filters
them manually (for NA12878 heterozygous sites only) and then checks
that the genomic coordinates and variant indices match against refhap-based phased
haplotype data files provided by Duitama et al.

## Steps to run
1. on a linux machine (tested using ubuntu) clone repository and build HapCUT2 using the makefile and instructions in main github README
2. change to this directory (HapCUT2/reproduce_hapcut2_paper/run_hapcut2_fosmid)
3. run run_hapcut2_fosmid_data.sh using bash:

```
bash run_hapcut2_fosmid_data.sh
```
or
```
chmod +x run_hapcut2_fosmid_data.sh
./run_hapcut2_fosmid_data.sh
```

The HapCUT2 haplotypes will be in ```data/hapcut2_haplotypes``` and the error rates compared to 1000G trio haplotypes will be printed to console.

References:

[1] Duitama, J., McEwen, G.K., Huebsch, T., Palczewski, S., Schulz, S., Verstrepen, K., Suk, E.K. and Hoehe, M.R., 2011. Fosmid-based whole genome haplotyping of a HapMap trio child: evaluation of Single Individual Haplotyping techniques. Nucleic acids research, 40(5), pp.2041-2053.

[2] 1000 Genomes Project Consortium, 2010. A map of human genome variation from population-scale sequencing. Nature, 467(7319), p.1061.
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@

header = ''
prev_chrom = None
chroms = ['chr{}'.format(x) for x in range(1,23)] + ['chrX']
output_filenames = ['data/NA12878_hg18_VCFs/{}.vcf'.format(c) for c in chroms]
output_files = {chrom : open(f,'w') for (chrom,f) in zip(chroms,output_filenames)}
with open("data/temp/NA12878_trio_genotypes_original.vcf",'r') as infile:
for line in infile:
# header lines
if line[0] == '#':
if line[:6] == '#CHROM': # need to remove parent sample labels
el = line.strip().split('\t')
line = '\t'.join(el[:9] + [el[11]])
header += line # add line to header so we can print it to separate chrom files
continue

el = line.strip().split('\t')
assert(len(el) == 12) # VCF line with 3 individuals has 12 elements
el = el[:9] + [el[11]] # remove parents

el[0] = 'chr' + el[0]
chrom = el[0] # add chr label
if chrom != prev_chrom: # we're on to a new chromosome
print(header, file=output_files[chrom]) # print header

new_line = '\t'.join(el)
# heterozygous variants for NA12878 only
if el[9][:3] in ['0/1','1/0','0|1','1|0']:
print(new_line, file=output_files[chrom])

prev_chrom = chrom

# close new VCFs
for f in output_files.values():
f.close()
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/bin/bash

HAPCUT2=../../build/HAPCUT2
HAP_STATISTICS=../../utilities/calculate_haplotype_statistics.py


mkdir -p data/temp
# download the thousand genomes VCFs
echo "DOWNLOADING 1000 GENOMES VCFS FOR NA12878 TRIO"
wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/pilot_data/release/2010_07/trio/snps/CEU.trio.2010_03.genotypes.vcf.gz \
-O data/temp/NA12878_trio_genotypes_original.vcf.gz
# unzip the thousand genomes VCFs
gunzip -c data/temp/NA12878_trio_genotypes_original.vcf.gz > data/temp/NA12878_trio_genotypes_original.vcf

echo "FILTERING 1000 GENOMES VCFS FOR JUST NA12878 HETEROZYGOUS SITES"
# process the thousand genomes VCFs to be just NA12878 heterozygous sites
mkdir data/NA12878_hg18_VCFs
python3 create_NA12878_hg18_vcfs.py

echo "CHECKING THAT NA12878 VARIANT INDICES/COORDINATES MATCH DUITAMA PHASED DATA"
# download the Duitama et al 'phased matrix'. It has genomic coordinates for the variants
# we basically want to make sure that our processed NA12878 VCF
# has matching variant indices and genomic coordinates
# to Duitama's own Refhap-based phased data.
wget http://www.molgen.mpg.de/~genetic-variation/SIH/Data/haplotypes.tar.gz \
-O data/temp/duitama_haplotypes.tar.gz
mkdir data/temp/duitama_haplotypes
tar -xzf data/temp/duitama_haplotypes.tar.gz -C data/temp/duitama_haplotypes
python3 sanity_check_variant_indices.py

echo "DOWNLOADING AND PROCESSING DUITAMA ET AL FRAGMENT FILES"
# download and unzip the Duitama et al phasing matrices (fragment files)
wget http://www.molgen.mpg.de/~genetic-variation/SIH/Data/phasing_matrices.tar.gz \
-O data/temp/phasing_matrices.tar.gz
mkdir data/NA12878_fosmid_data_original
mkdir data/NA12878_fosmid_data_formatted
tar -xzf data/temp/phasing_matrices.tar.gz -C data/NA12878_fosmid_data_original

# remove first line of fragment files
# Duitama et al phasing matrices have the first line as matrix dimensions
# this does not work with HapCUT2
for i in {1..22} X
do
tail -n +2 data/NA12878_fosmid_data_original/chr${i}.matrix.SORTED \
> data/NA12878_fosmid_data_formatted/chr${i}.matrix.SORTED
done

echo "RUNNING HAPCUT2 ON EACH CHROMOSOME OF DUITAMA ET AL DATA"
mkdir data/hapcut2_haplotypes
# run HapCUT2 on each chromosome
for i in {1..22} X; do
$HAPCUT2 --fragments data/NA12878_fosmid_data_formatted/chr${i}.matrix.SORTED \
--vcf data/NA12878_hg18_VCFs/chr${i}.vcf \
--out data/hapcut2_haplotypes/chr${i}.hap
done

echo "COMPARING ASSEMBLED HAPLOTYPE TO 1000G PHASE DATA FOR ACCURACY"
python3 $HAP_STATISTICS -v1 data/NA12878_hg18_VCFs/chr{1..22}.vcf \
data/NA12878_hg18_VCFs/chrX.vcf \
-h1 data/hapcut2_haplotypes/chr{1..22}.hap \
data/hapcut2_haplotypes/chrX.hap \
-v2 data/NA12878_hg18_VCFs/chr{1..22}.vcf \
data/NA12878_hg18_VCFs/chrX.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@

chroms = ['chr{}'.format(x) for x in range(1,23)] + ['chrX']

for c in chroms:
duitama_chrom_pos = []
with open("data/temp/duitama_haplotypes/{}.real_refhap.phase".format(c),'r') as duitama_file:
for line in duitama_file:
el = line.strip().split()
pos = int(el[0])
duitama_chrom_pos.append((c, pos))

NA12878_chrom_pos = []
with open("data/NA12878_hg18_VCFs/{}.vcf".format(c),'r') as NA12878_vcf:
for line in NA12878_vcf:
# header lines
if line[0] == '#':
continue

el = line.strip().split('\t')
chrom = el[0] # add chr label
pos = int(el[1])

NA12878_chrom_pos.append((chrom,pos))

print("checking that indices in Duitama haplotype with {} elements matches indices in NA12878 haplotype with {} elements...".format(len(duitama_chrom_pos), len(NA12878_chrom_pos)))
assert(duitama_chrom_pos == NA12878_chrom_pos)
print("...PASSED")
Loading

0 comments on commit 5a6f5df

Please sign in to comment.