# GWAS with array data for Hearing impairment traits

This notebook collects the scripts used to generate summary statistics for genotype array data with hearing impairment (HI) phenotypes. This is the step that precedes the interaction analysis.

## Columbia's cluster

Run this cell if running your jobs on Columbia's cluster

In [2]:
# Shared locations on Columbia's cluster: data roots and the working tree
UKBB_PATH="${HOME}/UKBiobank"
UKBB_yale="${HOME}/UKBiobank_Yale_transfer"
USER_PATH="${HOME}/working"

# Container images (.sif) handed to the workflows
container_lmm="${HOME}/containers/lmm.sif"
container_marp="${HOME}/containers/marp.sif"
container_annovar="${HOME}/containers/gatk4-annovar.sif"

# Phenotype files for the hearing impairment traits
hearing_pheno_path="${UKBB_PATH}/phenotype_files/hearing_impairment"

# Job-script template plus one format template per association tool
tpl_file="${USER_PATH}/bioworkflows/admin/csg.yml"
formatFile_fastgwa="${USER_PATH}/UKBB_GWAS_dev/data/fastGWA_template.yml"
formatFile_bolt="${USER_PATH}/UKBB_GWAS_dev/data/boltlmm_template.yml"
formatFile_saige="${USER_PATH}/UKBB_GWAS_dev/data/saige_template.yml"
formatFile_regenie="${USER_PATH}/UKBB_GWAS_dev/data/regenie_template.yml"

# Cleaned imputed BGEN input, left commented out (the cells below set genoFile themselves)
##genoFile=`echo $UKBB_yale/ukb39554_imputeddataset/ukb_imp_chr{1..22}_v3.bgen`
##sampleFile=$UKBB_yale/ukb39554_imputeddataset/ukb32285_imputedindiv.sample





## Shared variables for workflows and results

In [3]:
# SoS workflow notebooks driven from this notebook
lmm_sos="${USER_PATH}/bioworkflows/GWAS/LMM.ipynb"
anno_sos="${USER_PATH}/bioworkflows/variant-annotation/annovar.ipynb"
clumping_sos="${USER_PATH}/bioworkflows/GWAS/LD_Clumping.ipynb"
extract_sos="${USER_PATH}/bioworkflows/GWAS/Region_Extraction.ipynb"
snptogene_sos="${USER_PATH}/UKBB_GWAS_dev/workflow/snptogene.ipynb"

# Result directories for the array data, one per association tool
lmm_imp_dir_fastgwa="${UKBB_PATH}/results/FastGWA_results/results_array_data"
lmm_imp_dir_bolt="${UKBB_PATH}/results/BOLTLMM_results/results_array_data"
lmm_imp_dir_saige="${UKBB_PATH}/results/SAIGE_results/results_array_data"
lmm_imp_dir_regenie="${UKBB_PATH}/results/REGENIE_results/results_array_data"

# Result directories for the exome data, one per association tool
# NOTE(review): these resolve to the same results_array_data folders as the
# array variables above, which looks like a copy-paste slip. Confirm the
# intended exome location before using any lmm_exome_dir_* variable.
lmm_exome_dir_fastgwa="${UKBB_PATH}/results/FastGWA_results/results_array_data"
lmm_exome_dir_bolt="${UKBB_PATH}/results/BOLTLMM_results/results_array_data"
lmm_exome_dir_saige="${UKBB_PATH}/results/SAIGE_results/results_array_data"
lmm_exome_dir_regenie="${UKBB_PATH}/results/REGENIE_results/results_array_data"




## Specification of LMM variables

In [4]:
### REGENIE-specific settings shared by the per-trait cells
bsize="1000"
trait="bt"
minMAC="4"
reverse_log_p="True"

# Scratch location passed to the workflow as --lowmem_dir.
# NOTE(review): both paths are pinned to the 2021-10-27 f3393 run; any cell
# that does not override lowmem_dir inherits this location.
lowmem=~/UKBiobank/results/REGENIE_results/results_array_data/2021-10-27_f3393_array/scratch60
lowmem_dir="${lowmem}/predictions"

# Genotype QC filters, all left at 0 (the per-trait cells set them again)
maf_filter="0"
geno_filter="0"
hwe_filter="0"
mind_filter="0"




## Specification of LD clumping variables

In [5]:
# Root directory for LD clumping results
clumping_dir="${UKBB_PATH}/results/LD_clumping"

# Clumping defaults; these change depending on which traits are analyzed
# (the per-trait clumping cells below assign their own values)
ld_sample_size="200"
clump_field="P"
clump_p1="0.01"
clump_p2="1"
clump_r2="0.2"
clump_kb="2000"
clump_annotate="BP"

# Resources for the clumping job
numThreads="2"
clump_job_size="1"




# Regenie

## f3393

In [22]:
## REGENIE job script for f3393 (hearing aid), array data.
## All filters are set to 0 because this version of the bfile has already been QC'ed, so there is no need to filter again here
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
lmm_dir_regenie=$lmm_imp_dir_regenie/$(date +"%Y-%m-%d")_f3393_array
lmm_sbatch_regenie=~/scripts/f3393_hearing_aid_500K_array-regenie_$(date +"%Y-%m-%d").sbatch
# Keep the low-memory prediction prefix inside this run's own output directory.
# The value from the REGENIE settings cell is hard-coded to the 2021-10-27
# f3393 run, so it goes stale as soon as this cell is run on another date.
# NOTE(review): confirm scratch60/ exists (or is created by the workflow) before submitting the job.
lowmem_dir=$lmm_dir_regenie/scratch60/predictions
#Use the phenotype file for the 500K individuals
phenoFile=$hearing_pheno_path/fulldb_500K/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2
phenoCol=f3393
covarCol=sex
qCovarCol="age PC1 PC2"
#Use the original bed files that passed QC using Megan's parameters geno=0.01, mind=0.1, maf=0.01, hwe=5e-08
bfile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Use the same bfile here for the association
genoFile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
## After doing the annotation you can create the annotated Manhattan plot
anno_file=
label_annotate=SNP
known_vars=
new_vars=
lmm_job_size=1
ylim=0
bgenMinMAF=0.001
bgenMinINFO=0.8
# If --annotate then it will add the label to the plot otherwise --no-annotate

# Arguments forwarded verbatim to the LMM workflow (first word is the workflow name)
lmm_args="""regenie
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --lowmem_dir $lowmem_dir
    --trait $trait 
    --bgenMinMAF $bgenMinMAF 
    --bgenMinINFO $bgenMinINFO
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --minMAC $minMAC
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --anno_file $anno_file
    --label_annotate $label_annotate
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

# Write the sbatch script; this only generates the file, it does not submit the job
sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args" 

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/scripts/f3393_hearing_aid_500K_array-regenie_2021-10-27.sbatch[0m
INFO: Workflow csg (ID=w0db76696208dacd0) is executed successfully with 1 completed step.



## f2247

In [23]:
## REGENIE job script for f2247 (hearing difficulty), array data.
## All filters are set to 0 because this version of the bfile has already been QC'ed, so there is no need to filter again here
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
lmm_dir_regenie=$lmm_imp_dir_regenie/$(date +"%Y-%m-%d")_f2247_array
lmm_sbatch_regenie=~/scripts/f2247_hearing_difficulty_500K_array-regenie_$(date +"%Y-%m-%d").sbatch
# Keep the low-memory prediction prefix inside this run's own output directory.
# The value from the REGENIE settings cell is hard-coded to the 2021-10-27
# f3393 run, so without this line the f2247 job would write into f3393's folder.
# NOTE(review): confirm scratch60/ exists (or is created by the workflow) before submitting the job.
lowmem_dir=$lmm_dir_regenie/scratch60/predictions
#Use the phenotype file for the 500K individuals
phenoFile=$hearing_pheno_path/fulldb_500K/100521_UKBB_Hearing_difficulty_f2247_expandedwhite_110453cases_237318ctrl_500k_PC1_PC2
phenoCol=f2247
covarCol=sex
qCovarCol="age PC1 PC2"
#Use the original bed files that passed QC using Megan's parameters geno=0.01, mind=0.1, maf=0.01, hwe=5e-08
bfile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Use the same bfile here for the association
genoFile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
## After doing the annotation you can create the annotated Manhattan plot
anno_file=
label_annotate=SNP
known_vars=
new_vars=
lmm_job_size=1
ylim=0
bgenMinMAF=0.001
bgenMinINFO=0.8
# If --annotate then it will add the label to the plot otherwise --no-annotate

# Arguments forwarded verbatim to the LMM workflow (first word is the workflow name)
lmm_args="""regenie
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --lowmem_dir $lowmem_dir
    --trait $trait 
    --bgenMinMAF $bgenMinMAF 
    --bgenMinINFO $bgenMinINFO
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --minMAC $minMAC
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --anno_file $anno_file
    --label_annotate $label_annotate
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

# Write the sbatch script; this only generates the file, it does not submit the job
sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args" 

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/scripts/f2247_hearing_difficulty_500K_array-regenie_2021-10-27.sbatch[0m
INFO: Workflow csg (ID=wad0e7409c6d1b0fe) is executed successfully with 1 completed step.



## f2257

In [24]:
## REGENIE job script for f2257 (hearing in noise), array data.
## All filters are set to 0 because this version of the bfile has already been QC'ed, so there is no need to filter again here
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
lmm_dir_regenie=$lmm_imp_dir_regenie/$(date +"%Y-%m-%d")_f2257_array
lmm_sbatch_regenie=~/scripts/f2257_hearing_noise_500K_array-regenie_$(date +"%Y-%m-%d").sbatch
# Keep the low-memory prediction prefix inside this run's own output directory.
# The value from the REGENIE settings cell is hard-coded to the 2021-10-27
# f3393 run, so without this line the f2257 job would write into f3393's folder.
# NOTE(review): confirm scratch60/ exists (or is created by the workflow) before submitting the job.
lowmem_dir=$lmm_dir_regenie/scratch60/predictions
#Use the phenotype file for the 500K individuals
phenoFile=$hearing_pheno_path/fulldb_500K/100521_UKBB_Hearing_noise_f2257_expandedwhite_161443cases_237318ctrl_500k_PC1_PC2
phenoCol=f2257
covarCol=sex
qCovarCol="age PC1 PC2"
#Use the original bed files that passed QC using Megan's parameters geno=0.01, mind=0.1, maf=0.01, hwe=5e-08
bfile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Use the same bfile here for the association
genoFile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
## After doing the annotation you can create the annotated Manhattan plot
anno_file=
label_annotate=SNP
known_vars=
new_vars=
lmm_job_size=1
ylim=0
bgenMinMAF=0.001
bgenMinINFO=0.8
# If --annotate then it will add the label to the plot otherwise --no-annotate

# Arguments forwarded verbatim to the LMM workflow (first word is the workflow name)
lmm_args="""regenie
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --lowmem_dir $lowmem_dir
    --trait $trait 
    --bgenMinMAF $bgenMinMAF 
    --bgenMinINFO $bgenMinINFO
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --minMAC $minMAC
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --anno_file $anno_file
    --label_annotate $label_annotate
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

# Write the sbatch script; this only generates the file, it does not submit the job
sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args" 

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/scripts/f2257_hearing_noise_500K_array-regenie_2021-10-27.sbatch[0m
INFO: Workflow csg (ID=w42519ad431227916) is executed successfully with 1 completed step.



## f2247_f2257

In [25]:
## REGENIE job script for the combined f2247_f2257 trait, array data.
## All filters are set to 0 because this version of the bfile has already been QC'ed, so there is no need to filter again here
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
lmm_dir_regenie=$lmm_imp_dir_regenie/$(date +"%Y-%m-%d")_f2247_f2257_array
lmm_sbatch_regenie=~/scripts/Combined_f2247_f2257_500K_array-regenie_$(date +"%Y-%m-%d").sbatch
# Keep the low-memory prediction prefix inside this run's own output directory.
# The value from the REGENIE settings cell is hard-coded to the 2021-10-27
# f3393 run, so without this line this job would write into f3393's folder.
# NOTE(review): confirm scratch60/ exists (or is created by the workflow) before submitting the job.
lowmem_dir=$lmm_dir_regenie/scratch60/predictions
#Use the phenotype file for the 500K individuals
phenoFile=$hearing_pheno_path/fulldb_500K/100521_UKBB_Combined_f2247_f2257_expandedwhite_93258cases_237318ctrl_500k_PC1_PC2
phenoCol=f2247_f2257
covarCol=sex
qCovarCol="age PC1 PC2"
#Use the original bed files that passed QC using Megan's parameters geno=0.01, mind=0.1, maf=0.01, hwe=5e-08
bfile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Use the same bfile here for the association
genoFile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
## After doing the annotation you can create the annotated Manhattan plot
anno_file=
label_annotate=SNP
known_vars=
new_vars=
lmm_job_size=1
ylim=0
bgenMinMAF=0.001
bgenMinINFO=0.8
# If --annotate then it will add the label to the plot otherwise --no-annotate

# Arguments forwarded verbatim to the LMM workflow (first word is the workflow name)
lmm_args="""regenie
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --lowmem_dir $lowmem_dir
    --trait $trait 
    --bgenMinMAF $bgenMinMAF 
    --bgenMinINFO $bgenMinINFO
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --minMAC $minMAC
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --anno_file $anno_file
    --label_annotate $label_annotate
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

# Write the sbatch script; this only generates the file, it does not submit the job
sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args" 

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/scripts/Combined_f2247_f2257_hearing_noise_500K_array-regenie_2021-10-27.sbatch[0m
INFO: Workflow csg (ID=wcc879e6698ce2cab) is executed successfully with 1 completed step.



# LD clumping

## f3393

In [26]:
# LD clumping job script for f3393: output folder and the script written into it
cwd=~/UKBiobank/results/LD_clumping/102721_f3393_500k_array
clumping_sbatch="${cwd}/ldclumping_$(date +%Y-%m-%d).sbatch"

# Inputs: REGENIE summary statistics, unrelated-sample list and LD reference panel.
# The glob in sumstatsFiles is stored literally; it is not expanded in this cell.
sumstatsFiles=~/UKBiobank/results/REGENIE_results/results_array_data/2021-10-27_f3393_array/*.regenie.snp_stats.gz
## Original note: individuals from the subset of white individuals with exome data
unrelated_samples="${UKBB_PATH}/results/083021_PCA_results/090221_ldprun_unrelated/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.europeans.filtered.090221_ldprun_unrelated.filtered.fam"
bfile_ref=~/UKBiobank/results/LD_clumping/ref_files/ukb39554_c1_22_v3.imputed.2000.ref_geno.bed

# Clumping thresholds for this trait (these replace the notebook-wide defaults)
clump_field="P"
clump_p1="0.01"
clump_p2="1"
clump_r2="0.04"
clump_kb="2000"
clump_annotate="BP"   # assigned here but not forwarded in clumping_args
ld_sample_size="200"

# Job resources
numThreads="2"
clump_job_size="1"

# The filter_samples and reference workflows select samples and build the
# reference file; the default workflow used here runs the LD clumping itself.
clumping_args="""default
    --cwd ${cwd}
    --bfile_ref ${bfile_ref} 
    --sumstatsFiles ${sumstatsFiles} 
    --unrelated_samples ${unrelated_samples} 
    --ld_sample_size ${ld_sample_size} 
    --clump_field ${clump_field}
    --clump_p1 ${clump_p1} 
    --clump_p2 ${clump_p2} 
    --clump_r2 ${clump_r2} 
    --clump_kb ${clump_kb} 
    --numThreads ${numThreads} 
    --job_size ${clump_job_size}
    --container_lmm ${container_lmm}
"""

# Generate the sbatch script (nothing is submitted from here)
sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "${tpl_file}" \
    --workflow-file "${clumping_sos}" \
    --to-script "${clumping_sbatch}" \
    --args "${clumping_args}"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/LD_clumping/102721_f3393_500k_array/ldclumping_2021-10-28.sbatch[0m
INFO: Workflow csg (ID=we7b7a105e030e83a) is executed successfully with 1 completed step.



## f2247

In [25]:
# LD clumping job script for f2247: output folder and the script written into it
cwd=~/UKBiobank/results/LD_clumping/102721_f2247_500k_array
clumping_sbatch="${cwd}/ldclumping_$(date +%Y-%m-%d).sbatch"

# Inputs: REGENIE summary statistics, unrelated-sample list and LD reference panel.
# The glob in sumstatsFiles is stored literally; it is not expanded in this cell.
sumstatsFiles=~/UKBiobank/results/REGENIE_results/results_array_data/2021-10-27_f2247_array/*.regenie.snp_stats.gz
## Original note: individuals from the subset of white individuals with exome data
unrelated_samples="${UKBB_PATH}/results/083021_PCA_results/090221_ldprun_unrelated/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.europeans.filtered.090221_ldprun_unrelated.filtered.fam"
bfile_ref=~/UKBiobank/results/LD_clumping/ref_files/ukb39554_c1_22_v3.imputed.2000.ref_geno.bed

# Clumping thresholds for this trait (these replace the notebook-wide defaults)
clump_field="P"
clump_p1="0.01"
clump_p2="1"
clump_r2="0.04"
clump_kb="2000"
clump_annotate="BP"   # assigned here but not forwarded in clumping_args
ld_sample_size="200"

# Job resources
numThreads="2"
clump_job_size="1"

# The filter_samples and reference workflows select samples and build the
# reference file; the default workflow used here runs the LD clumping itself.
clumping_args="""default
    --cwd ${cwd}
    --bfile_ref ${bfile_ref} 
    --sumstatsFiles ${sumstatsFiles} 
    --unrelated_samples ${unrelated_samples} 
    --ld_sample_size ${ld_sample_size} 
    --clump_field ${clump_field}
    --clump_p1 ${clump_p1} 
    --clump_p2 ${clump_p2} 
    --clump_r2 ${clump_r2} 
    --clump_kb ${clump_kb}  
    --numThreads ${numThreads} 
    --job_size ${clump_job_size}
    --container_lmm ${container_lmm}
"""

# Generate the sbatch script (nothing is submitted from here)
sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "${tpl_file}" \
    --workflow-file "${clumping_sos}" \
    --to-script "${clumping_sbatch}" \
    --args "${clumping_args}"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m (index=0) is [32mignored[0m due to saved signature
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/LD_clumping/102721_f2247_500k_array/ldclumping_2021-10-28.sbatch[0m
INFO: Workflow csg (ID=wec1daa4251170e23) is ignored with 1 ignored step.



## f2257

In [27]:
# LD clumping job script for f2257: output folder and the script written into it
cwd=~/UKBiobank/results/LD_clumping/102721_f2257_500k_array
clumping_sbatch="${cwd}/ldclumping_$(date +%Y-%m-%d).sbatch"

# Inputs: REGENIE summary statistics, unrelated-sample list and LD reference panel.
# The glob in sumstatsFiles is stored literally; it is not expanded in this cell.
sumstatsFiles=~/UKBiobank/results/REGENIE_results/results_array_data/2021-10-27_f2257_array/*.regenie.snp_stats.gz
## Original note: individuals from the subset of white individuals with exome data
unrelated_samples="${UKBB_PATH}/results/083021_PCA_results/090221_ldprun_unrelated/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.europeans.filtered.090221_ldprun_unrelated.filtered.fam"
bfile_ref=~/UKBiobank/results/LD_clumping/ref_files/ukb39554_c1_22_v3.imputed.2000.ref_geno.bed

# Clumping thresholds for this trait (these replace the notebook-wide defaults)
clump_field="P"
clump_p1="0.01"
clump_p2="1"
clump_r2="0.04"
clump_kb="2000"
clump_annotate="BP"   # assigned here but not forwarded in clumping_args
ld_sample_size="200"

# Job resources
numThreads="2"
clump_job_size="1"

# The filter_samples and reference workflows select samples and build the
# reference file; the default workflow used here runs the LD clumping itself.
clumping_args="""default
    --cwd ${cwd}
    --bfile_ref ${bfile_ref} 
    --sumstatsFiles ${sumstatsFiles} 
    --unrelated_samples ${unrelated_samples} 
    --ld_sample_size ${ld_sample_size} 
    --clump_field ${clump_field}
    --clump_p1 ${clump_p1} 
    --clump_p2 ${clump_p2} 
    --clump_r2 ${clump_r2} 
    --clump_kb ${clump_kb} 
    --numThreads ${numThreads} 
    --job_size ${clump_job_size}
    --container_lmm ${container_lmm}
"""

# Generate the sbatch script (nothing is submitted from here)
sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "${tpl_file}" \
    --workflow-file "${clumping_sos}" \
    --to-script "${clumping_sbatch}" \
    --args "${clumping_args}"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/LD_clumping/102721_f2257_500k_array/ldclumping_2021-10-28.sbatch[0m
INFO: Workflow csg (ID=w39ace774540257f2) is executed successfully with 1 completed step.



## f2247_f2257

In [28]:
# LD clumping job script for the combined f2247_f2257 trait: output folder and the script written into it
cwd=~/UKBiobank/results/LD_clumping/102721_f2247_f2257_500k_array
clumping_sbatch="${cwd}/ldclumping_$(date +%Y-%m-%d).sbatch"

# Inputs: REGENIE summary statistics (a single named file for this trait),
# unrelated-sample list and LD reference panel
sumstatsFiles=~/UKBiobank/results/REGENIE_results/results_array_data/2021-10-27_f2247_f2257_array/100521_UKBB_Combined_f2247_f2257_expandedwhite_93258cases_237318ctrl_500k_PC1_PC2_f2247_f2257.regenie.snp_stats.gz
## Original note: individuals from the subset of white individuals with exome data
unrelated_samples="${UKBB_PATH}/results/083021_PCA_results/090221_ldprun_unrelated/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.europeans.filtered.090221_ldprun_unrelated.filtered.fam"
bfile_ref=~/UKBiobank/results/LD_clumping/ref_files/ukb39554_c1_22_v3.imputed.2000.ref_geno.bed

# Clumping thresholds for this trait (these replace the notebook-wide defaults)
clump_field="P"
clump_p1="0.01"
clump_p2="1"
clump_r2="0.04"
clump_kb="2000"
clump_annotate="BP"   # assigned here but not forwarded in clumping_args
ld_sample_size="200"

# Job resources
numThreads="2"
clump_job_size="1"

# The filter_samples and reference workflows select samples and build the
# reference file; the default workflow used here runs the LD clumping itself.
clumping_args="""default
    --cwd ${cwd}
    --bfile_ref ${bfile_ref} 
    --sumstatsFiles ${sumstatsFiles} 
    --unrelated_samples ${unrelated_samples} 
    --ld_sample_size ${ld_sample_size} 
    --clump_field ${clump_field}
    --clump_p1 ${clump_p1} 
    --clump_p2 ${clump_p2} 
    --clump_r2 ${clump_r2} 
    --clump_kb ${clump_kb} 
    --numThreads ${numThreads} 
    --job_size ${clump_job_size}
    --container_lmm ${container_lmm}
"""

# Generate the sbatch script (nothing is submitted from here)
sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "${tpl_file}" \
    --workflow-file "${clumping_sos}" \
    --to-script "${clumping_sbatch}" \
    --args "${clumping_args}"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/LD_clumping/102721_f2247_f2257_500k_array/ldclumping_2021-10-28.sbatch[0m
INFO: Workflow csg (ID=w4349dc0d69d7cfe4) is executed successfully with 1 completed step.



# 110921 Regenie with callrate99

## f3393

In [6]:
## REGENIE job script for f3393 (hearing aid), array data, callrate99 genotype set.
## All filters are set to 0 because this version of the bfile has already been QC'ed, so there is no need to filter again here
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
lmm_dir_regenie=$lmm_imp_dir_regenie/$(date +"%Y-%m-%d")_f3393_array
lmm_sbatch_regenie=~/scripts/f3393_hearing_aid_500K_array-regenie_callrate99_$(date +"%Y-%m-%d").sbatch
# Keep the low-memory prediction prefix inside this run's own output directory.
# The value from the REGENIE settings cell is hard-coded to the 2021-10-27
# f3393 run, so without this line this job would reuse that older run's folder.
# NOTE(review): confirm scratch60/ exists (or is created by the workflow) before submitting the job.
lowmem_dir=$lmm_dir_regenie/scratch60/predictions
#Use the phenotype file for the 500K individuals
phenoFile=$hearing_pheno_path/fulldb_500K/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2
phenoCol=f3393
covarCol=sex
qCovarCol="age PC1 PC2"
#Use the original bed files that passed QC using Megan's parameters geno=0.01, mind=0.1, maf=0.01, hwe=5e-08
# NOTE(review): this cell points at 083021_sample_variant_qc_final rather than the callrate90 file; confirm the QC parameters quoted above describe this file.
bfile=$UKBB_PATH/genotype_files_processed/083021_sample_variant_qc_final/cache/UKB_genotypedatadownloaded083019.083021_sample_variant_qc_final.filtered.extracted.bed
# Use the same bfile here for the association
genoFile=$UKBB_PATH/genotype_files_processed/083021_sample_variant_qc_final/cache/UKB_genotypedatadownloaded083019.083021_sample_variant_qc_final.filtered.extracted.bed
## After doing the annotation you can create the annotated Manhattan plot
anno_file=
label_annotate=SNP
known_vars=
new_vars=
lmm_job_size=1
ylim=0
bgenMinMAF=0.001
bgenMinINFO=0.8
# If --annotate then it will add the label to the plot otherwise --no-annotate

# Arguments forwarded verbatim to the LMM workflow (first word is the workflow name)
lmm_args="""regenie
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --lowmem_dir $lowmem_dir
    --trait $trait 
    --bgenMinMAF $bgenMinMAF 
    --bgenMinINFO $bgenMinINFO
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --minMAC $minMAC
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --anno_file $anno_file
    --label_annotate $label_annotate
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

# Write the sbatch script; this only generates the file, it does not submit the job
sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args" 

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/scripts/f3393_hearing_aid_500K_array-regenie_callrate99_2021-11-09.sbatch[0m
INFO: Workflow csg (ID=wa359d7ce3ee4c049) is executed successfully with 1 completed step.



## f2247 

In [6]:
## REGENIE job script for f2247 (hearing difficulty), array data, callrate99 genotype set.
## All filters are set to 0 because this version of the bfile has already been QC'ed, so there is no need to filter again here
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
# The output directory is pinned to the 2021-11-09 run instead of today's date
lmm_dir_regenie=$lmm_imp_dir_regenie/2021-11-09_f2247_array
lmm_sbatch_regenie=~/scripts/f2247_hearing_difficulty_500K_array_callrate99-regenie_$(date +"%Y-%m-%d").sbatch
# Keep the low-memory prediction prefix inside this run's own output directory.
# The value from the REGENIE settings cell is hard-coded to the 2021-10-27
# f3393 run, so without this line the f2247 job would write into f3393's folder.
# NOTE(review): confirm scratch60/ exists (or is created by the workflow) before submitting the job.
lowmem_dir=$lmm_dir_regenie/scratch60/predictions
#Use the phenotype file for the 500K individuals
phenoFile=$hearing_pheno_path/fulldb_500K/100521_UKBB_Hearing_difficulty_f2247_expandedwhite_110453cases_237318ctrl_500k_PC1_PC2
phenoCol=f2247
covarCol=sex
qCovarCol="age PC1 PC2"
#Use the original bed files that passed QC using Megan's parameters geno=0.01, mind=0.1, maf=0.01, hwe=5e-08
# NOTE(review): this cell points at 083021_sample_variant_qc_final rather than the callrate90 file; confirm the QC parameters quoted above describe this file.
bfile=$UKBB_PATH/genotype_files_processed/083021_sample_variant_qc_final/cache/UKB_genotypedatadownloaded083019.083021_sample_variant_qc_final.filtered.extracted.bed
# Use the same bfile here for the association
genoFile=$UKBB_PATH/genotype_files_processed/083021_sample_variant_qc_final/cache/UKB_genotypedatadownloaded083019.083021_sample_variant_qc_final.filtered.extracted.bed
## After doing the annotation you can create the annotated Manhattan plot
anno_file=
label_annotate=SNP
known_vars=
new_vars=
lmm_job_size=1
ylim=0
bgenMinMAF=0.001
bgenMinINFO=0.8
# If --annotate then it will add the label to the plot otherwise --no-annotate

# Arguments forwarded verbatim to the LMM workflow (first word is the workflow name)
lmm_args="""regenie
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --lowmem_dir $lowmem_dir
    --trait $trait 
    --bgenMinMAF $bgenMinMAF 
    --bgenMinINFO $bgenMinINFO
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --minMAC $minMAC
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --anno_file $anno_file
    --label_annotate $label_annotate
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

# Write the sbatch script; this only generates the file, it does not submit the job
sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args" 

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/scripts/f2247_hearing_difficulty_500K_array_callrate99-regenie_2021-11-09.sbatch[0m
INFO: Workflow csg (ID=wceb74800de03829f) is executed successfully with 1 completed step.



## f2257

In [7]:
## REGENIE job script for f2257 (hearing in noise), array data, callrate99 genotype set.
## All filters are set to 0 because this version of the bfile has already been QC'ed, so there is no need to filter again here
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
lmm_dir_regenie=$lmm_imp_dir_regenie/$(date +"%Y-%m-%d")_f2257_array
lmm_sbatch_regenie=~/scripts/f2257_hearing_noise_500K_array_callrate99-regenie_$(date +"%Y-%m-%d").sbatch
# Keep the low-memory prediction prefix inside this run's own output directory.
# The value from the REGENIE settings cell is hard-coded to the 2021-10-27
# f3393 run, so without this line the f2257 job would write into f3393's folder.
# NOTE(review): confirm scratch60/ exists (or is created by the workflow) before submitting the job.
lowmem_dir=$lmm_dir_regenie/scratch60/predictions
#Use the phenotype file for the 500K individuals
phenoFile=$hearing_pheno_path/fulldb_500K/100521_UKBB_Hearing_noise_f2257_expandedwhite_161443cases_237318ctrl_500k_PC1_PC2
phenoCol=f2257
covarCol=sex
qCovarCol="age PC1 PC2"
#Use the original bed files that passed QC using Megan's parameters geno=0.01, mind=0.1, maf=0.01, hwe=5e-08
# NOTE(review): this cell points at 083021_sample_variant_qc_final rather than the callrate90 file; confirm the QC parameters quoted above describe this file.
bfile=$UKBB_PATH/genotype_files_processed/083021_sample_variant_qc_final/cache/UKB_genotypedatadownloaded083019.083021_sample_variant_qc_final.filtered.extracted.bed
# Use the same bfile here for the association
genoFile=$UKBB_PATH/genotype_files_processed/083021_sample_variant_qc_final/cache/UKB_genotypedatadownloaded083019.083021_sample_variant_qc_final.filtered.extracted.bed
## After doing the annotation you can create the annotated Manhattan plot
anno_file=
label_annotate=SNP
known_vars=
new_vars=
lmm_job_size=1
ylim=0
bgenMinMAF=0.001
bgenMinINFO=0.8
# If --annotate then it will add the label to the plot otherwise --no-annotate

# Arguments forwarded verbatim to the LMM workflow (first word is the workflow name)
lmm_args="""regenie
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --lowmem_dir $lowmem_dir
    --trait $trait 
    --bgenMinMAF $bgenMinMAF 
    --bgenMinINFO $bgenMinINFO
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --minMAC $minMAC
    --job_size $lmm_job_size
    --ylim $ylim
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --anno_file $anno_file
    --label_annotate $label_annotate
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

# Write the sbatch script; this only generates the file, it does not submit the job
sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args" 

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/scripts/f2257_hearing_noise_500K_array_callrate99-regenie_2021-11-09.sbatch[0m
INFO: Workflow csg (ID=w1ab5363be5c8ca04) is executed successfully with 1 completed step.



## f2247_f2257

In [8]:
## REGENIE job script for the combined f2247_f2257 trait, array data, callrate99 genotype set.
## The bfile was already QC'ed upstream, so every filter stays at 0 here
maf_filter="0"
geno_filter="0"
hwe_filter="0"
mind_filter="0"

# Trait column, covariates and the phenotype file for the 500K individuals
phenoCol="f2247_f2257"
covarCol="sex"
qCovarCol="age PC1 PC2"
phenoFile="${hearing_pheno_path}/fulldb_500K/100521_UKBB_Combined_f2247_f2257_expandedwhite_93258cases_237318ctrl_500k_PC1_PC2"

# Genotypes: original note says these bed files passed QC using Megan's
# parameters geno=0.01, mind=0.1, maf=0.01, hwe=5e-08. The same file is used
# for --bfile and for the association (--genoFile).
bfile="${UKBB_PATH}/genotype_files_processed/083021_sample_variant_qc_final/cache/UKB_genotypedatadownloaded083019.083021_sample_variant_qc_final.filtered.extracted.bed"
genoFile="${bfile}"

# Dated output directory, generated job script and this run's own low-memory prefix
lmm_dir_regenie="${lmm_imp_dir_regenie}/$(date +%Y-%m-%d)_f2247_f2257_array"
lmm_sbatch_regenie=~/scripts/Combined_f2247_f2257_500K_array_callrate99-regenie_$(date +%Y-%m-%d).sbatch
lowmem_dir="${lmm_dir_regenie}/scratch60/predictions"

## Plot settings; once the annotation exists, anno_file can be filled in to
## produce the annotated Manhattan plot
anno_file=""
label_annotate="SNP"
known_vars=""
new_vars=""
ylim="0"
# If --annotate then it will add the label to the plot otherwise --no-annotate

# Remaining workflow parameters
lmm_job_size="1"
bgenMinMAF="0.001"
bgenMinINFO="0.8"

# Arguments forwarded verbatim to the LMM workflow (first word is the workflow name)
lmm_args="""regenie
    --cwd ${lmm_dir_regenie} 
    --bfile ${bfile} 
    --genoFile ${genoFile}
    --phenoFile ${phenoFile} 
    --formatFile ${formatFile_regenie} 
    --phenoCol ${phenoCol}
    --covarCol ${covarCol}  
    --qCovarCol ${qCovarCol}
    --bsize ${bsize}
    --lowmem_dir ${lowmem_dir}
    --trait ${trait} 
    --bgenMinMAF ${bgenMinMAF} 
    --bgenMinINFO ${bgenMinINFO}
    --maf_filter ${maf_filter}
    --geno_filter ${geno_filter}
    --hwe_filter ${hwe_filter}
    --mind_filter ${mind_filter}
    --minMAC ${minMAC}
    --job_size ${lmm_job_size}
    --ylim ${ylim}
    --reverse_log_p ${reverse_log_p}
    --numThreads ${numThreads}
    --anno_file ${anno_file}
    --label_annotate ${label_annotate}
    --container_lmm ${container_lmm}
    --container_marp ${container_marp}
"""

# Generate the sbatch script (nothing is submitted from here)
sos run ~/working/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "${tpl_file}" \
    --workflow-file "${lmm_sos}" \
    --to-script "${lmm_sbatch_regenie}" \
    --args "${lmm_args}"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/scripts/Combined_f2247_f2257_500K_array_callrate99-regenie_2021-11-09.sbatch[0m
INFO: Workflow csg (ID=wf5f1e39ec0618388) is executed successfully with 1 completed step.

