# Post-GWAS analyses

# Aim

The aim of this notebook is to carry out post-GWAS analyses, such as annotating SNPs to genes, gene expression and pathway analyses, among others.

Here [snpGeneSets v1.12](https://www.umc.edu/SoPH/Departments-and-Faculty/Data-Science/Research/Services/Software.html) is used; FUMA, a web-based resource, is also useful for this purpose.


## Usage



In [1]:
[global]
# the output directory for generated files
parameter: cwd = path
# Path sumstats file
parameter: sumstatsFile = path
# Genome assembly (GRCh) used for SNP mapping: 37 or 38
parameter: hg = int
# Job size
parameter: job_size = 1
# Number of cores requested by the tasks (used as `cores = numThreads` in the steps below)
parameter: numThreads = 1
# Load annovar module from cluster
parameter: annovar_module = '''
module load ANNOVAR/2020Jun08-foss-2018b-Perl-5.28.0
echo "Module annovar loaded"
{cmd}
'''
# Software container option
parameter: container_annovar = 'gaow/gatk4-annovar'

In [2]:
# Annotate snps to gene
[snp_to_gene]
# Column name for BP
parameter: bp = 'POS'
# Column name for p-value
parameter: pval = 'P'
# Column name for SNP
parameter: snp = 'SNP'
input: sumstatsFile
output: f'{_input:nn}.gene_ann'
task: trunk_workers = 1, trunk_size = job_size, walltime = '3h', mem = '10G', tags = f'{step_name}_{_output:bn}'
R: expand='${ }', stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    library('snpGeneSets')
    library('dplyr')
    # Import the (gzipped) sumstats file as a data frame
    data <- read.table(gzfile('${_input}'), header=TRUE)
    head(data)
    # Keep SNPs with p-value < 5e-8 and subset to chr, pos and snp for gene mapping.
    # The p-value, position and SNP columns are taken from the step parameters.
    sig.p <- data %>%
      filter(${pval} < 5e-8) %>%
      mutate(chr = CHR,
             pos = ${bp},
             snp = as.character(${snp})) %>%
      select(chr, pos, snp)
    head(sig.p)
    # Get the annotation of SNPs for the requested genome assembly
    snpMapAnn <- getSNPMap(sig.p$snp, GRCh=${hg})
    # Mapping SNPs to genes (define gene boundary ‘up’ for the upstream region and ‘down’ for the downstream region with default value of 2,000 bp for both)
    snpGeneMapAnn <- snp2Gene(snpMapAnn$rsid_map$snp)
    cat("The unique number of genes is", length(unique(snpGeneMapAnn$map$gene_id)), "\n")
    cat("The number of variants that could not be mapped to a gene is:", length(snpGeneMapAnn$other), "\n")
    # Get the gene-name and gene-id for the mapped variants
    gene_mapped <- getGeneMap(snpGeneMapAnn$map$gene_id)$gene_map
    # Merge the datasets: SNP map -> gene ids -> gene names
    snp_gene <- merge(x = snpMapAnn$rsid_map, y = snpGeneMapAnn$map[,c("snp", "gene_id")], by="snp", all.x=TRUE)
    snp_gene_2 <- merge(x = snp_gene, y = gene_mapped[,c("gene_id", "gene_name")], by="gene_id", all.x=TRUE)
    # Rename the SNP column back to the sumstats column name so it can be used as merge key
    names(snp_gene_2)[names(snp_gene_2) == 'snp'] <- '${snp}'
    snp_gene_3 <- merge(x = snp_gene_2, y = data[,c("A1", "A2", "N", "AF1", "${pval}", "BETA", "SE", "INFO", "${snp}")], by="${snp}", all.x=TRUE)
    # Get the final table ordered by p-value
    final_gene_set <- snp_gene_3 %>%
     select(chr, ${snp}, pos, A1, A2, N, AF1, BETA, SE, ${pval}, INFO, gene_id, gene_name) %>%
     arrange(${pval})
    names(final_gene_set)[names(final_gene_set) == 'chr'] <- 'CHR'
    names(final_gene_set)[names(final_gene_set) == 'pos'] <- 'POS'
    # Write results to a tab-separated table
    write.table(final_gene_set, '${_output}', sep = "\t", quote=FALSE, row.names=FALSE)

In [None]:
# Merge all the bimfiles into a single file to use later with awk
# Only need to run this cell once
[bim_merge]
# Path(s) to the original bim files; several files may be given
parameter: bimfiles = paths
# Name for the merged bim files
parameter: bim_name = path
input: bimfiles
output: bim_name
task: trunk_workers = 1, walltime = '10h', mem = '10G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout' 
      # Concatenate every input bim file, in the order given, into the merged file
      cat ${_input} > ${_output}

In [None]:
# Get the list of significantly associated SNPs
[annovar_1]
# Column name for BP
parameter: bp = 'POS'
# Column name for p-value
parameter: pval = 'P'
# Column name for SNP
parameter: snp = 'SNP'
input: sumstatsFile
output: f'{cwd}/{_input:bnn}.snp_annotate'
task: trunk_workers = 1, trunk_size = job_size, walltime = '3h', mem = '10G', tags = f'{step_name}_{_output:bn}'
R: expand='${ }', stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    library('dplyr')
    # Import the (gzipped) sumstats file as a data frame
    data <- read.table(gzfile('${_input}'), header=TRUE)
    # Keep SNPs with p-value < 5e-8 and retain only the SNP identifier column.
    # The p-value and SNP columns are taken from the step parameters.
    sig.p <- data %>%
      filter(${pval} < 5e-8) %>%
      select(${snp})
    # One SNP id per line, no header: the next step reads this list with awk
    write.table(sig.p, '${_output}', sep = " ", quote=FALSE, row.names=FALSE, col.names=FALSE)

In [None]:
# Get chr, start, end, ref_allele, alt_allele format
[annovar_2]
parameter: bim_name = path
output: f'{_input:n}.avinput'
task: trunk_workers = 1, walltime = '10h', mem = '10G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: expand= "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout' 
    # Intermediate file holding the bim rows whose id (column 2) is in the SNP list
    matched=${_output:n}.tmp
    # First file: remember every id in column 1; second file: keep rows whose column 2 was seen
    awk -F" " 'FNR==NR {wanted[$1]; next} $2 in wanted' ${_input} ${bim_name} > $matched
    # Emit: chr, start, end, column 6, column 5.
    # When the id contains "D" the end is shifted by the allele length difference,
    # otherwise start and end are the same position.
    awk '{
        if ($2 ~ /D/) {
            print $1, $4, $4 + (length ($6) - length ($5)), $6, $5
        } else {
            print $1, $4, $4, $6, $5
        }
    }' $matched > ${_output}
    # remove temporary files
    rm -f $matched

In [None]:
# Annotate variants file using ANNOVAR
[annovar_3]
# humandb path for ANNOVAR
parameter: humandb = path
# Human genome build
parameter: build = 'hg38'
#add xreffile to option without -exonicsplicing
#mart_export_2019_LOFtools3.txt #xreffile latest option -> Phenotype description,HGNC symbol,MIM morbid description,CGD_CONDITION,CGD_inh,CGD_man,CGD_comm,LOF_tools
#parameter: x_ref = path(f"{humandb}/mart_export_2019_LOFtools3.txt")
# Annovar protocol
parameter: protocol = ['refGene', 'refGeneWithVer', 'knownGene', 'ensGene', 'phastConsElements30way', 'encRegTfbsClustered', 'gwasCatalog', 'gnomad211_genome', 'gnomad211_exome', 'gme', 'kaviar_20150923', 'abraom', 'avsnp150', 'dbnsfp41a', 'dbscsnv11', 'regsnpintron', 'clinvar_20200316', 'gene4denovo201907']
# Annovar operation
parameter: operation = ['g', 'g', 'g', 'g', 'r', 'r', 'r', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f']
# Annovar args
parameter: arg = ['"-splicing 12 -exonicsplicing"', '"-splicing 30"', '"-splicing 12 -exonicsplicing"', '"-splicing 12"', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
# NOTE: protocol, operation and arg each hold 18 entries and are comma-joined
# in the same order on the command line below; keep the three lists aligned
# by position when adding or removing a database.
# The result is written under cwd, named after the input basename and the build.
output: f'{cwd}/{_input:bn}.{build}_multianno.csv'
# The task template runs the command as-is when annotate_variation.pl is found
# as an executable; otherwise it wraps the command in annovar_module.
task: trunk_workers = 1, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output:bn}', template = '{cmd}' if executable('annotate_variation.pl').target_exists() else annovar_module
# humandb is mounted into the container at the same absolute path.
# -out receives the output path with its last two extensions stripped.
bash: container=container_annovar, volumes=[f'{humandb:a}:{humandb:a}'], expand="${ }", stderr=f'{_output}.stderr', stdout=f'{_output}.stdout'
    #do not add -intronhgvs as option -> writes cDNA variants as HGVS but creates issues (+2 splice site reported only)
    #-nastring . can only be . for VCF files
    #regsnpintron might cause shifted lines (be carefull using)
    table_annovar.pl \
        ${_input} \
        ${humandb} \
        -buildver ${build} \
        -out ${_output:nn}\
        -remove \
        -polish \
        -nastring . \
        -protocol ${",".join(protocol)} \
        -operation ${",".join(operation)} \
        -arg ${",".join(arg)} \
        -csvout 

## To run this notebook

In [6]:
# Example invocation of the `annovar` workflow (steps annovar_1 to annovar_3).
# The paths below are specific to the authors' cluster; adapt them before running.

# Output directory for generated files
cwd=/home/dc2325/scratch60/output/
# Gzipped summary statistics file to annotate
sumstatsFile=/gpfs/gibbs/pi/dewan/data/UKBiobank/results/REGENIE_results/results_exome_data/f3393_hearing_aid_exomes/010421_UKBB_Hearing_aid_f3393_128254ind_exomes_hearing_aid_cat.regenie.snp_stats.gz
hg=38
job_size=1
# Container image used by the annovar_3 step
container_annovar=/home/dc2325/scratch60/annovar.sif
# Per-chromosome bim files (chr1-22); defined here but not passed to the
# `sos run` command below
bimfiles=`echo /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c{1..22}_b0_v1.bim`
# Merged bim file read by annovar_2
bim_name=/home/dc2325/scratch60/output/ukb23155_chr1_chr22.bim
# ANNOVAR database directory
humandb=/gpfs/ysm/datasets/db/annovar/humandb

sos run ~/project/UKBB_GWAS_dev/workflow/snptogene.ipynb annovar \
    --cwd $cwd \
    --sumstatsFile $sumstatsFile\
    --bim_name $bim_name \
    --hg $hg \
    --job_size $job_size \
    --humandb $humandb\
    --container_annovar $container_annovar\
    -s build

INFO: Running [32mannovar_1[0m: Get the list of significantly associated SNPs
INFO: [32mannovar_1[0m is [32mcompleted[0m.
INFO: [32mannovar_1[0m output:   [32m/home/dc2325/scratch60/output/010421_UKBB_Hearing_aid_f3393_128254ind_exomes_hearing_aid_cat.regenie.snp_annotate[0m
INFO: Running [32mannovar_2[0m: Get chr, start, end, ref_allele, alt_allele format
INFO: [32mannovar_2[0m is [32mcompleted[0m.
INFO: [32mannovar_2[0m output:   [32m/home/dc2325/scratch60/output/010421_UKBB_Hearing_aid_f3393_128254ind_exomes_hearing_aid_cat.regenie.avinput[0m
INFO: Running [32mannovar_3[0m: Annotate variants file using ANNOVAR
INFO: [32mannovar_3[0m is [32mcompleted[0m.
INFO: [32mannovar_3[0m output:   [32m/home/dc2325/scratch60/output/010421_UKBB_Hearing_aid_f3393_128254ind_exomes_hearing_aid_cat.regenie.hg38_multianno.csv[0m
INFO: Workflow annovar (ID=w85450d9626e3d7a0) is executed successfully with 3 completed steps.
