From d96cafd7cf64b2e4d3f6e6a2c09568a40228d1f9 Mon Sep 17 00:00:00 2001 From: Daniel Ariad Date: Fri, 19 Nov 2021 18:20:26 -0500 Subject: [PATCH] Daily build --- MAKE_OBS_TAB.py | 4 ++-- pipeline.txt | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/MAKE_OBS_TAB.py b/MAKE_OBS_TAB.py index b787185..189afb1 100644 --- a/MAKE_OBS_TAB.py +++ b/MAKE_OBS_TAB.py @@ -86,10 +86,10 @@ def retrive_bases(bam_filename,legend_filename,fasta_filename,handle_multiple_ob pos = 0 for pileupcolumn in samfile.pileup(**arg): - while pileupcolumn.pos > pos-1: ### Chromosomal position starts from 1 in legend table, while it starts from 0 in the pileup iterator. + while pileupcolumn.reference_pos > pos-1: ### Chromosomal position starts from 1 in legend table, while it starts from 0 in the pileup iterator. chr_id,pos,ref,alt = next(leg_tab_iterator) - if pileupcolumn.pos == pos-1: + if pileupcolumn.reference_pos == pos-1: rows = [obs_tuple(pos, pileupread.alignment.query_name, pileupread.alignment.query_sequence[pileupread.query_position]) for pileupread in pileupcolumn.pileups if pileupread.query_position!=None] # query_position is None if the base on the padded read is a deletion or a skip (e.g. spliced alignment). diff --git a/pipeline.txt b/pipeline.txt index 1d19154..0c154e5 100644 --- a/pipeline.txt +++ b/pipeline.txt @@ -110,9 +110,15 @@ The script `MAKE_REF_PANEL.py` creates reference panels for LD-PGTA, using phase We run the script with the following arguments and flags: -`python3 MAKE_REF_PANEL.py EUR_panel.samples ALL.chr21.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz`, +`python3 MAKE_REF_PANEL.py EUR_panel.samples ALL.chr21.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz --mask 20160622.chr1.mask.fasta.gz`, -where the first argument is the IMPUTE2 SAMPLE file. The second required argument is a VCF filename that correspond to a single chromosome. +where the first argument is the IMPUTE2 SAMPLE file. 
The second required argument is a VCF filename that corresponds to a single chromosome. The third argument, an accessibility mask file in gzipped FASTA format, is optional. Supplying an accessibility mask file will reduce false SNPs in regions of the genome that are less accessible to NGS methods. [GRCh38 genome accessibility masks for 1000 Genomes data](https://www.internationalgenome.org/announcements/genome-accessibility-masks/) are available from: [http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/working/20160622_genome_mask_GRCh38/](http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/working/20160622_genome_mask_GRCh38/). + +In addition, the following flags are supported: + +`--output-directory ` _The directory in which the reference panel will be created._ + +`--force-module cyvcf2/pysam` _By default, the cyvcf2 module is used. This flag allows using pysam instead._ # Tabulation of allele observations at known SNPs # @@ -304,7 +310,7 @@ The next step is to convert the reference panel VCF to BCF format: `--exclude-types indels,mnps,ref,bnd,other \`\ `--min-alleles 2 \`\ `--max-alleles 2 \`\ -`--min-ac 1 \`\ +`--min-ac 1:minor \`\ `--phased \`\ `--exclude 'AN!=2*N_SAMPLES' \`\ `--output-file chr21_EUR_panel.bcf \`\