In [None]:
# Stage a private copy of the QUILT example package (package_2021_01_15A) under ~/QUILT.
mkdir -p ~/QUILT/package_2021_01_15A/
# NOTE(review): the source is an absolute path inside another user's home directory;
# it must exist and be readable from the account running this notebook.
cp -r /home/jupyter-bgi23_siyang/software/QUILT/package_2021_01_15A/* ~/QUILT/package_2021_01_15A/
# The following cells use paths relative to ~/QUILT (./package_2021_01_15A/..., quilt_output).
cd ~/QUILT

In [None]:
# Invoke the QUILT.R command-line script with -h (expected to print its usage / option list).
/home/jupyter-bgi23_siyang/software/QUILT/QUILT.R -h

# Input formats
Input reference panel. IMPUTE-format hap and legend files with reference haplotypes. These can be made from haplotype VCFs using `bcftools convert --haplegendsample`. Alternatively, they can be made manually. The haplotype file is a gzipped file with no header and no rownames, with one row per SNP and one column per reference haplotype, space separated, with values of 0 (ref) and 1 (alt). The legend file is a gzipped file with no rownames and a header row including `position` for the physical position in 1-based coordinates, `a0` for the reference allele, and `a1` for the alternate allele. An optional sample file and a file with samples to exclude can be useful for changing who is used in the reference panel.

Genetic map. File with genetic map information, with 3 white-space delimited columns giving position (1-based), genetic rate map in cM/Mbp, and genetic map in cM

Bams. Given as a bamlist (i.e. a file with one row per sample, the path to the bam)

(Optional) Truth data: phasefile and posfile. Useful for understanding performance. The phasefile has a header row with a name for each sample, matching what is found in the bam file. The file is tab separated, one subject per column, with 0 = ref and 1 = alt, separated by a vertical bar |, e.g. 0|0 or 0|1. Note that this file therefore has one more row than the posfile, which has no header. The posfile is a file with the positions at which to impute, lining up one-to-one with the SNPs of the phasefile. The file is tab separated with no header, one row per SNP, with col 1 = chromosome, col 2 = physical position (sorted from smallest to largest), col 3 = reference base, col 4 = alternate base. Bases are capitalized. Example first row: `1<tab>1000<tab>A<tab>G`

In [None]:
# Print the example bamlist: the file later passed to QUILT via --bamlist
# (one row per sample giving the path to its BAM, per the input-format notes).
cat ./package_2021_01_15A/bamlist.1.0.txt

In [None]:
# reference_haplotype_file: decompress the gzipped reference haplotype file and show its first 5 rows.
# This is the file later passed to QUILT via --reference_haplotype_file.
zcat ./package_2021_01_15A/ALL.chr20_GRCh38.genotypes.20170504.chr20.2000001.2100000.noNA12878.hap.gz|head -n 5

In [None]:
# reference_legend_file: decompress the gzipped legend file and show its first 5 rows
# (header included, per the input-format notes). Passed to QUILT via --reference_legend_file.
zcat ./package_2021_01_15A/ALL.chr20_GRCh38.genotypes.20170504.chr20.2000001.2100000.noNA12878.legend.gz|head -n 5

In [None]:
# genetic_map_file: decompress the gzipped genetic map and show its first 5 rows.
# Passed to QUILT via --genetic_map_file.
zcat ./package_2021_01_15A/CEU-chr20-final.b38.txt.gz|head -n 5

In [None]:
# Run QUILT on the example region chr20:2000001-2100000 using the staged example package.
# All input/output paths below are relative, so this cell presumably has to be run with
# ~/QUILT as the working directory (set by the staging cell) -- confirm before re-running.

# Remove any output left by a previous run; -f keeps this silent when the directory is absent.
rm -r -f quilt_output
# Inputs: --bamlist (samples), --reference_haplotype_file / --reference_legend_file (reference
# panel), --genetic_map_file, and the optional truth data --posfile / --phasefile.
# NOTE(review): --buffer, --nGen and --save_prepared_reference are not explained in this
# notebook; see the `QUILT.R -h` output for their meaning.
/home/jupyter-bgi23_siyang/software/QUILT/QUILT.R  \
--outputdir=quilt_output \
--chr=chr20 \
--regionStart=2000001 \
--regionEnd=2100000 \
--buffer=10000 \
--bamlist=package_2021_01_15A/bamlist.1.0.txt \
--posfile=package_2021_01_15A/ALL.chr20_GRCh38.genotypes.20170504.chr20.2000001.2100000.posfile.txt \
--phasefile=package_2021_01_15A/ALL.chr20_GRCh38.genotypes.20170504.chr20.2000001.2100000.phasefile.txt \
--reference_haplotype_file=package_2021_01_15A/ALL.chr20_GRCh38.genotypes.20170504.chr20.2000001.2100000.noNA12878.hap.gz \
--reference_legend_file=package_2021_01_15A/ALL.chr20_GRCh38.genotypes.20170504.chr20.2000001.2100000.noNA12878.legend.gz \
--genetic_map_file=package_2021_01_15A/CEU-chr20-final.b38.txt.gz \
--nGen=100 \
--save_prepared_reference=TRUE

# Output formats
Output VCF with both SNP annotation information (see below) and per-sample genotype information. Per-sample genotype information includes the following entries

- **GT** (Phased genotypes): Phased genotype, where each allele is the rounded per-haplotype posterior probability (HD below).

- **GP** (Genotype posteriors): Posterior probabilities of the three genotypes given the data.

- **DS** (Diploid dosage): Posterior expectation of the diploid genotype, i.e. the expected number of copies of the alternate allele.

- **HD** (Haploid dosages): Per-haplotype posterior probability of an alternate allele.

Note that in QUILT, genotype posteriors (GP) and dosages (DS) are taken from the main Gibbs sampling, while the phasing results (GT and HD) are taken from an additional special phasing Gibbs sample. As such, phasing results (GT and HD) might not be consistent with genotype information (GP and DS). If consistency is necessary, note that you can create a consistent GP and DS from HD.

In [None]:
# Move into the QUILT output directory and show the first 20 lines of the imputed VCF.
# NOTE(review): in a shell kernel this cd presumably persists into later cells, so cells that
# use paths relative to ~/QUILT (e.g. the QUILT run) would need a `cd ~/QUILT` before re-running.
cd ~/QUILT/quilt_output
zcat quilt.chr20.2000001.2100000.vcf.gz|head -n 20

In [None]:
# INFO_SCORE
# Visualize the distribution of the per-site INFO_SCORE annotation in the QUILT output VCF.
import sys
import gzip
import re
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns


def read_info_scores(vcf_path, key="INFO_SCORE"):
    """Collect the numeric ``key`` annotation from the INFO column of a VCF.

    Parameters
    ----------
    vcf_path : str
        Path to a VCF file; opened with gzip when the name ends in ".gz",
        otherwise as plain text.
    key : str, optional
        Name of the INFO entry to extract (default "INFO_SCORE").

    Returns
    -------
    list of float
        One value per record that carries a parseable ``key=<number>`` entry,
        in file order. Header lines (starting with "#"), records with fewer
        than 8 columns, records without the key, and records whose value is
        not a number (e.g. the VCF missing marker ".") are skipped.
    """
    # Anchor on start-of-field or ';' so that a longer key that merely ends
    # in ``key`` (e.g. "XINFO_SCORE=") is not picked up by mistake.
    pattern = re.compile(r"(?:^|;)" + re.escape(key) + r"=([^;]+)")
    opener = gzip.open if vcf_path.endswith(".gz") else open

    scores = []
    with opener(vcf_path, "rt") as handle:
        for line in handle:
            if line.startswith("#"):
                continue

            # Only the 8th column (INFO) is needed; capping the split avoids
            # tokenising every per-sample genotype column of each record.
            fields = line.split(None, 8)
            if len(fields) < 8:
                # Blank or truncated line: nothing to parse.
                continue

            match = pattern.search(fields[7])
            if match is None:
                continue

            try:
                scores.append(float(match.group(1)))
            except ValueError:
                # Non-numeric value such as "." -- treat as missing.
                continue

    return scores


if __name__ == "__main__":
    in_fname = os.path.join(
        os.path.expanduser("~"),
        "QUILT", "quilt_output", "quilt.chr20.2000001.2100000.vcf.gz",
    )

    data = read_info_scores(in_fname)

    # NOTE(review): the column keeps its historical name "R2" so that any code
    # relying on df["R2"] still works, but the values are the INFO_SCORE
    # annotation parsed above; the axis label reflects that.
    df = pd.DataFrame({"R2": data})

    # Sites with a score of exactly 0 are left out of the histogram, as before.
    nonzero = df[df["R2"] != 0]

    if nonzero.empty:
        print("No non-zero INFO_SCORE values found in " + in_fname + "; nothing to plot.")
    else:
        f, ax = plt.subplots(1, 1, figsize=(6, 4), constrained_layout=True)
        sns.histplot(data=nonzero, x="R2", kde=True, color="b", ax=ax)
        # Log-scaled counts keep sparsely populated score bins visible.
        ax.set_yscale("log")
        ax.set(
            xlabel="INFO_SCORE",
            ylabel="Number of sites (log scale)",
            title="QUILT INFO_SCORE distribution (chr20:2000001-2100000)",
        )
        plt.show()


In [None]:
# Print the current working directory (presumably a check of where the earlier cd calls left the session).
pwd