# GWAS with array data for Hearing impairment traits

This notebook collects the scripts used to generate summary statistics for genotype array data with hearing impairment (HI) phenotypes. This step must be completed before running the interaction analysis.

## Columbia's cluster

Run this cell if you are running your jobs on Columbia's cluster.

In [6]:
# Paths shared by every analysis when running on Columbia's cluster

# Base locations
UKBB_PATH="${HOME}/UKBiobank"
UKBB_yale="${HOME}/UKBiobank_Yale_transfer"
USER_PATH="${HOME}/project"

# Singularity container images
container_lmm="${HOME}/containers/lmm.sif"
container_marp="${HOME}/containers/marp.sif"
container_annovar="${HOME}/containers/gatk4-annovar.sif"

# Phenotype folder for the hearing impairment traits
hearing_pheno_path="${UKBB_PATH}/phenotype_files/hearing_impairment"

# Job template and per-tool format files
tpl_file="${USER_PATH}/bioworkflows/admin/csg.yml"
formatFile_fastgwa="${USER_PATH}/UKBB_GWAS_dev/data/fastGWA_template.yml"
formatFile_bolt="${USER_PATH}/UKBB_GWAS_dev/data/boltlmm_template.yml"
formatFile_saige="${USER_PATH}/UKBB_GWAS_dev/data/saige_template.yml"
formatFile_regenie="${USER_PATH}/UKBB_GWAS_dev/data/regenie_template.yml"

# Cleaned imputed data BGEN input (kept disabled; this notebook works with array data)
##genoFile=`echo $UKBB_yale/ukb39554_imputeddataset/ukb_imp_chr{1..22}_v3.bgen`
##sampleFile=$UKBB_yale/ukb39554_imputeddataset/ukb32285_imputedindiv.sample





## Shared variables for workflows and results

In [7]:
# SoS workflow notebooks
lmm_sos="${USER_PATH}/bioworkflows/GWAS/LMM.ipynb"
anno_sos="${USER_PATH}/bioworkflows/variant-annotation/annovar.ipynb"
clumping_sos="${USER_PATH}/bioworkflows/GWAS/LD_Clumping.ipynb"
extract_sos="${USER_PATH}/bioworkflows/GWAS/Region_Extraction.ipynb"
snptogene_sos="${USER_PATH}/UKBB_GWAS_dev/workflow/snptogene.ipynb"

# Per-tool LMM result directories for the genotype array data
lmm_imp_dir_fastgwa="${UKBB_PATH}/results/FastGWA_results/results_array_data"
lmm_imp_dir_bolt="${UKBB_PATH}/results/BOLTLMM_results/results_array_data"
lmm_imp_dir_saige="${UKBB_PATH}/results/SAIGE_results/results_array_data"
lmm_imp_dir_regenie="${UKBB_PATH}/results/REGENIE_results/results_array_data"

# Per-tool LMM result directories for the exome data
# NOTE(review): these resolve to the same results_array_data folders as the
# array-data variables above -- confirm whether a separate exome folder was intended
lmm_exome_dir_fastgwa="${UKBB_PATH}/results/FastGWA_results/results_array_data"
lmm_exome_dir_bolt="${UKBB_PATH}/results/BOLTLMM_results/results_array_data"
lmm_exome_dir_saige="${UKBB_PATH}/results/SAIGE_results/results_array_data"
lmm_exome_dir_regenie="${UKBB_PATH}/results/REGENIE_results/results_array_data"




## Specification of LMM variables

In [8]:
### REGENIE-specific settings

# Scratch space; the predictions folder lives directly under it
lowmem="${HOME}/scratch60/"
lowmem_dir="${lowmem}predictions"

# Values later forwarded to the workflow as --bsize, --trait and --minMAC
bsize=1000
trait=bt
minMAC=4

# Genotype QC filters, all set to 0 here
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0

# Value later forwarded to the workflow as --reverse_log_p
reverse_log_p=True




## Specification of LD clumping variables

In [4]:
# Where the LD clumping results are written
clumping_dir="${UKBB_PATH}/results/LD_clumping"

# Clumping settings; adjust these depending on which traits are analyzed
clump_field=P
clump_p1=0.01
clump_p2=1
clump_r2=0.2
clump_kb=2000
clump_annotate=BP

# Sample size setting for the LD step
ld_sample_size=200

# Job resources
numThreads=2
clump_job_size=1




# f3393

In [None]:
## Generate the sbatch script that runs REGENIE on the genotype array data for trait f3393.

## All filters are set to 0 because this version of the bfile has already been QC'ed, so there is no need to filter again here
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
# Output directory for this run. The shared-variables cell defines lmm_imp_dir_regenie;
# the previously used name lmm_imputed_dir_regenie is not defined in this notebook, so the
# path collapsed to /<date>_f3393_array when run on a fresh kernel.
lmm_dir_regenie=$lmm_imp_dir_regenie/$(date +"%Y-%m-%d")_f3393_array
# NOTE(review): the script name says 200K while the phenotype file below is the 500K one -- confirm the name
lmm_sbatch_regenie=$USER_PATH/UKBB_GWAS_dev/output/f3393_hearing_aid_200K-regenie_$(date +"%Y-%m-%d").sbatch
# Use the phenotype file for the 500K individuals
phenoFile=$hearing_pheno_path/fulldb_500K/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2
phenoCol=f3393
covarCol=sex
qCovarCol="age PC1 PC2"
# Use the original bed files that passed QC using Megan's parameters geno=0.01, mind=0.1, maf=0.01, hwe=5e-08
bfile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Use the same bfile here for the association
genoFile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
## After doing the annotation you can create the annotated Manhattan plot
# NOTE(review): anno_file is empty here, so "--anno_file" is emitted without a value -- confirm the workflow accepts that
anno_file=
label_annotate=SNP
known_vars=
new_vars=

# If --annotate then it will add the label to the plot otherwise --no-annotate

# NOTE(review): bgenMinMAF, bgenMinINFO, lmm_job_size and ylim are not assigned in any cell
# visible in this notebook, and numThreads is only assigned in the LD clumping cell.
# Make sure they are set in the session before running this cell; otherwise their flags
# are written into the job script with no value.
lmm_args="""regenie
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --lowmem_dir $lowmem_dir
    --trait $trait 
    --bgenMinMAF $bgenMinMAF 
    --bgenMinINFO $bgenMinINFO
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --minMAC $minMAC
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --anno_file $anno_file
    --label_annotate $label_annotate
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

# Write the job script: Get_Job_Script combines the template file, the workflow file and the arguments above
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args" 