# gnomAD VCF Extract
##### Updated 06/03/2024
##### Tian Yu, Lara Brown, Selin Kubali

## Note:
This code is not meant to be run on DNAnexus and is provided here for convenience. Resources and an explanation of how to run it are provided at https://github.com/TYTYBU/vcfByGene

#### Goal:
Extract VCF files for the exonic regions (plus a small flank) of given genes from gnomAD data.

#### Required inputs
See https://github.com/TYTYBU/vcfByGene

#### Output
A VCF file containing information from gnomAD for each gene. Output to *selected_genes/hcm/gnomAD/gnomAD_gene_vcfs*


In [None]:
import pandas as pd
import re

### Set parameters

In [19]:
# --- Extraction settings ---
exon_flank_nt = 5  # flanking nucleotides from the start and end of exons
number_of_threads = 4  # number of threads used in bcftools output compression
tag_str = 'skubali'  # DNAnexus job tag

# --- DNAnexus project locations (all paths are rooted at project_path) ---
project_path = 'project-GGy3Bb0JqBj7zfxY8v4by61X:/'

dx_vcf_in_path = f"{project_path}selected_genes/hcm/gnomAD/gnomAD_chrom_vcfs/"
dx_vcf_out_path = f"{project_path}selected_genes/hcm/gnomAD/gnomAD_gene_vcfs/"
dx_resource_path = f"{project_path}GRCh38_resources/"
dx_script_path = f"{project_path}scripts/"

# --- Resource files ---
# MANE transcript table is read from the local filesystem; the other two are
# file names resolved relative to dx_resource_path.
mane_transcript = "./resources/MANE.GRCh38.v1.0.select_ensembl_genomic.csv.gz"
diff_bed = 'GRCh38_alldifficultregions.bed.gz'
ref_genome = 'GRCh38_reference_genome.fa'

### Helper functions

### List of gene symbols as input

In [28]:
# Gene symbols to extract; they are matched against the MANE `gene_name` column.
genes = [
    "ACTN2",
    "ALPK3",
    "DES",
    "FLNC",
    "MYBPC3",
    "MYH6",
    "MYH7",
    "PLN",
    "PTPN11",
    "TNNI3",
    "TTR",
    "TNNT2",
    "TPM1",
    "MYL2",
    "MYL3",
    "ACTC1",
]

### Load MANE transcript coordinates

In [36]:
# Load the MANE table and keep only exon rows of the requested genes,
# restricted to the coordinate columns needed downstream.
df = pd.read_csv(mane_transcript)
df = df.loc[
    df['feature'].eq('exon') & df['gene_name'].isin(genes),
    ['seqname', 'start', 'end', 'gene_name'],
]

# Pad every exon by exon_flank_nt on both sides.
df = df.assign(
    exon_flank_start=df['start'] - exon_flank_nt,
    exon_flank_end=df['end'] + exon_flank_nt,
)

# One "<seqname>:<start>-<end>" string per padded exon.
df['region'] = (
    df['seqname'] + ':'
    + df['exon_flank_start'].astype(str) + '-'
    + df['exon_flank_end'].astype(str)
)
df

Unnamed: 0,seqname,start,end,gene_name,exon_flank_start,exon_flank_end,region,pos_region
21285,chr1,26067336,26067630,TRIM63,26067331,26067635,chr1:26067331-26067635,26067331-26067635
21288,chr1,26066268,26066440,TRIM63,26066263,26066445,chr1:26066263-26066445,26066263-26066445
21290,chr1,26061166,26061334,TRIM63,26061161,26061339,chr1:26061161-26061339,26061161-26061339
21292,chr1,26060266,26060361,TRIM63,26060261,26060366,chr1:26060261-26060366,26060261-26060366
21294,chr1,26058390,26058623,TRIM63,26058385,26058628,chr1:26058385-26058628,26058385-26058628
...,...,...,...,...,...,...,...,...
473557,chr20,44159618,44160407,JPH2,44159613,44160412,chr20:44159613-44160412,44159613-44160412
473559,chr20,44118505,44118623,JPH2,44118500,44118628,chr20:44118500-44118628,44118500-44118628
473561,chr20,44115665,44116386,JPH2,44115660,44116391,chr20:44115660-44116391,44115660-44116391
473563,chr20,44114782,44114876,JPH2,44114777,44114881,chr20:44114777-44114881,44114777-44114881


In [None]:
genes_not_found = []
genes_found = []
known_large_genes = ["DSP", "TSC2", "TTN", "NCOA3"]

for gene in genes:
    df_gene = df.loc[df['gene_name'] == gene]
    if df_gene.shape[0] > 0:
        genes_found.append(gene)
        
        region_str = ','.join(df_gene['region'].to_list())


        match = re.match(r'^([^:]*)', region_str)
        chrom = match.group(1)


        name_input = "gnomad.exomes.v4.1.sites."+chrom+".vcf.bgz"
        name_output = "vcf_for_" + gene + ".gz" 

        mem_level = "mem1_ssd1_v2_x8"
        
        bcf_cmd = "bcftools view -r " + region_str + " -Oz -o" + name_output + " " + name_input
        dx_input_str = '-iin=' +'"' +dx_vcf_in_path + name_input + '"' + ' -iin=' + '"' + dx_vcf_in_path + name_input + '.tbi' + '"'
        dx_input_str = dx_input_str + ' -iin="' + dx_resource_path + diff_bed + '"'
        dx_input_str = dx_input_str + ' -iin="' + dx_resource_path + ref_genome + '"'
        dx_input_str = dx_input_str + ' -iin="' + dx_resource_path + ref_genome + '.fai"'
        dx_command = 'dx run swiss-army-knife --instance-type ' + mem_level + ' -y --brief ' + dx_input_str + ' -icmd="' + bcf_cmd + '" --destination ' + dx_vcf_out_path + ' --tag "' + tag_str + '" --property gene=' + gene
        !{dx_command}


    else:
        genes_not_found.append(gene)
            
print('Genes not found in MANE database:')
print(genes_not_found)