# Association analysis imputed data

Do the association analysis for the Hearing impairment traits with the white european individuals present in the 500K samples

## Columbia's cluster

In [1]:
# Common variables Columbia's cluster
UKBB_PATH=$HOME/UKBiobank
UKBB_yale=$HOME/UKBiobank_Yale_transfer
USER_PATH=$HOME/working
container_lmm=$HOME/containers/lmm.sif
container_marp=$HOME/containers/marp.sif
container_annovar=$HOME/containers/gatk4-annovar.sif
hearing_pheno_path=$UKBB_PATH/phenotype_files/hearing_impairment
tpl_file=$USER_PATH/bioworkflows/admin/csg.yml
formatFile_fastgwa=$USER_PATH/UKBB_GWAS_dev/data/fastGWA_template.yml
formatFile_bolt=$USER_PATH/UKBB_GWAS_dev/data/boltlmm_template.yml
formatFile_saige=$USER_PATH/UKBB_GWAS_dev/data/saige_template.yml
formatFile_regenie=$USER_PATH/UKBB_GWAS_dev/data/regenie_template.yml

## Shared variables for workflows and results

In [2]:
# Workflows
lmm_sos=$USER_PATH/bioworkflows/GWAS/LMM.ipynb
anno_sos=$USER_PATH/bioworkflows/variant-annotation/annovar.ipynb
clumping_sos=$USER_PATH/bioworkflows/GWAS/LD_Clumping.ipynb
extract_sos=$USER_PATH/bioworkflows/GWAS/Region_Extraction.ipynb
snptogene_sos=$USER_PATH/UKBB_GWAS_dev/workflow/snptogene.ipynb

# LMM directories for imputed data
lmm_imp_dir_fastgwa=$UKBB_PATH/results/FastGWA_results/results_imputed_data
lmm_imp_dir_bolt=$UKBB_PATH/results/BOLTLMM_results/results_imputed_data
lmm_imp_dir_saige=$UKBB_PATH/results/SAIGE_results/results_imputed_data
lmm_imp_dir_regenie=$UKBB_PATH/results/REGENIE_results/results_imputed_data

# LMM directories for exome data
lmm_exome_dir_fastgwa=$UKBB_PATH/results/FastGWA_results/results_exome_data
lmm_exome_dir_bolt=$UKBB_PATH/results/BOLTLMM_results/results_exome_data
lmm_exome_dir_saige=$UKBB_PATH/results/SAIGE_results/results_exome_data
lmm_exome_dir_regenie=$UKBB_PATH/results/REGENIE_results/results_exome_data

In [3]:
## LMM variables 
## Specific to Bolt_LMM
LDscoresFile=$UKBB_PATH/LDSCORE.1000G_EUR.tab.gz
geneticMapFile=$UKBB_PATH/genetic_map_hg19_withX.txt.gz
covarMaxLevels=10
numThreads=20
bgenMinMAF=0.001
bgenMinINFO=0.8
lmm_job_size=1
ylim=0

### Specific to FastGWA (depeding if you run from Yale or Columbia)
####Yale's cluster
grmFile=$UKBB_PATH/results/FastGWA_results/results_imputed_data/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.grm.sp
####Columbia's cluster
grmFile=$UKBB_yale/results/FastGWA_results/results_imputed_data/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.grm.sp

### Specific to SAIGE
bgenMinMAC=4
trait_type=binary
loco=TRUE
sampleCol=IID

### Specific to REGENIE
bsize=1000
lowmem=$HOME/scratch60/
lowmem_dir=$HOME/scratch60/predictions
trait=bt
minMAC=4
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
reverse_log_p=True

## Subset the individuals excluding samples with the code 1415 but without a sub-category code for f20002 self-report

In [2]:
import pandas as pd
sample500k = pd.read_csv("/home/gl2776/UKBiobank/phenotype_files/HI_UKBB/ukb47922_white_460649ind.keep_id",header=None,sep=" ")
sample500k

Unnamed: 0,0,1
0,1000019,1000019
1,1000022,1000022
2,1000035,1000035
3,1000046,1000046
4,1000054,1000054
...,...,...
460644,6025390,6025390
460645,6025409,6025409
460646,6025411,6025411
460647,6025425,6025425


In [3]:
exclusion = pd.read_csv("/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/with_1415_without_subcat.500k.sample_id.txt",header=0,sep="\t")
exclusion = exclusion["IID"].to_list()
exclusion[:5]

[1000767, 1002847, 1004257, 1005452, 1009003]

In [4]:
len(exclusion)

4226

In [5]:
sample500k[~sample500k[0].isin(exclusion)].shape

(456561, 2)

In [None]:
sample500k[~sample500k[0].isin(exclusion)].to_csv("/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/ukb47922_white_456561ind_exclu_1415.keep_id",header=False,index=False,sep=" ")

In [13]:
# f3393
file = "/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k_PC1_PC2"
phe = pd.read_csv(file,header=0,sep="\t")
phe

Unnamed: 0,FID,IID,sex,f3393,age,PC1,PC2
0,1000112,1000112,0,1,68,0.011931,0.031198
1,1001067,1001067,0,1,50,0.004472,0.002904
2,1001384,1001384,1,1,61,0.002001,-0.001850
3,1001459,1001459,0,1,64,0.010770,0.022391
4,1002548,1002548,0,1,62,0.000226,-0.014383
...,...,...,...,...,...,...,...
251428,6025363,6025363,0,0,64,0.004982,0.006218
251429,6025409,6025409,1,0,66,0.002693,-0.003802
251430,6025411,6025411,1,0,49,0.014998,0.007031
251431,6025425,6025425,1,0,44,0.017568,0.020817


In [14]:
phe[["FID", "IID"]].to_csv("/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k_PC1_PC2.keep_id", sep='\t', index=False, header=False)

In [15]:
# f2247
file = "/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_difficulty_f2247_expandedwhite_107994cases_236699ctrl_500k_PC1_PC2"
phe = pd.read_csv(file,header=0,sep="\t")
phe[["FID", "IID"]].to_csv("/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_difficulty_f2247_expandedwhite_107994cases_236699ctrl_500k_PC1_PC2.keep_id", sep='\t', index=False, header=False)
# f2257
file = "/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_noise_f2257_expandedwhite_159033cases_236699ctrl_500k_PC1_PC2"
phe = pd.read_csv(file,header=0,sep="\t")
phe[["FID", "IID"]].to_csv("/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_noise_f2257_expandedwhite_159033cases_236699ctrl_500k_PC1_PC2.keep_id", sep='\t', index=False, header=False)
# f2247_f2257
file = "/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Combined_f2247_f2257_expandedwhite_91092cases_236699ctrl_500k_PC1_PC2"
phe = pd.read_csv(file,header=0,sep="\t")
phe[["FID", "IID"]].to_csv("/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Combined_f2247_f2257_expandedwhite_91092cases_236699ctrl_500k_PC1_PC2.keep_id", sep='\t', index=False, header=False)

In [9]:
# phenopca
import pandas as pd
# f3393
file = "/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k_PC1_PC2"
list1415 = pd.read_csv(file,header=0,sep="\t")["IID"].to_list()
file = "/home/gl2776/UKBiobank/results/092821_PCA_results_500K/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k.phenopca"
phe = pd.read_csv(file,header=0,sep="\t")
phe[phe["IID"].isin(list1415)].to_csv("/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/071122_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k.phenopca", sep='\t', index=False)

In [8]:
# f2247
file = "/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_difficulty_f2247_expandedwhite_107994cases_236699ctrl_500k_PC1_PC2"
list1415 = pd.read_csv(file,header=0,sep="\t")["IID"].to_list()
file = "/home/gl2776/UKBiobank/results/092821_PCA_results_500K/100521_UKBB_Hearing_difficulty_f2247_expandedwhite_110453cases_237318ctrl_500k.phenopca"
phe = pd.read_csv(file,header=0,sep="\t")
phe[phe["IID"].isin(list1415)].to_csv("/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/071122_UKBB_Hearing_difficulty_f2247_expandedwhite_107994cases_236699ctrl_500k.phenopca", sep='\t', index=False)
# f2247
file = "/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_noise_f2257_expandedwhite_159033cases_236699ctrl_500k_PC1_PC2"
list1415 = pd.read_csv(file,header=0,sep="\t")["IID"].to_list()
file = "/home/gl2776/UKBiobank/results/092821_PCA_results_500K/100521_UKBB_Hearing_noise_f2257_expandedwhite_161443cases_237318ctrl_500k.phenopca"
phe = pd.read_csv(file,header=0,sep="\t")
phe[phe["IID"].isin(list1415)].to_csv("/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/071122_UKBB_Hearing_noise_f2257_expandedwhite_159033cases_236699ctrl_500k.phenopca", sep='\t', index=False)
# f2247
file = "/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Combined_f2247_f2257_expandedwhite_91092cases_236699ctrl_500k_PC1_PC2"
list1415 = pd.read_csv(file,header=0,sep="\t")["IID"].to_list()
file = "/home/gl2776/UKBiobank/results/092821_PCA_results_500K/100521_UKBB_Combined_f2247_f2257_expandedwhite_93258cases_237318ctrl_500k.phenopca"
phe = pd.read_csv(file,header=0,sep="\t")
phe[phe["IID"].isin(list1415)].to_csv("/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/071122_UKBB_Combined_f2247_f2257_expandedwhite_91092cases_236699ctrl_500k.phenopca", sep='\t', index=False)

## Calculate the first 2 PC's for each of these phenotypes to run the association analysis with the imputed data

### 500k Data

#### f.3393

##### Step 1

In [7]:
## Columbia's cluster
cwd=$UKBB_PATH/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f3393
#gwas_sbatch=$USER_PATH/UKBB_GWAS_dev/output/qc1_f3393_qcarray_$(date +"%Y-%m-%d").sbatch
gwas_sbatch=$cwd/qc1_f3393_qcarray_$(date +"%Y-%m-%d").sbatch
## Use qc'ed genotype array
genoFile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
keep_samples=$UKBB_PATH/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k_PC1_PC2.keep_id
#Keep variants after LD pruning
keep_variants=$UKBB_PATH/results/092821_PCA_results_500K/092821_ldprun_unrelated/cache/*092821_ldprun_unrelated.filtered.prune.in

#GWAS QC variables set all of this variables to 0 to avoid doing more filtering
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
numThreads=1
mem='30G'

gwasqc_sos=$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/GWAS_QC.ipynb

gwasqc_args="""qc:1
    --cwd $cwd
    --genoFile $genoFile
    --keep_samples $keep_samples
    --keep_variants $keep_variants
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --numThreads $numThreads 
    --job_size $job_size
    --container $container_lmm
    --mem $mem
"""

sos run $USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $gwasqc_sos \
    --to-script $gwas_sbatch \
    --args "$gwasqc_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f3393/qc1_f3393_qcarray_2022-07-11.sbatch[0m
INFO: Workflow csg (ID=we6b0c18b257f3d69) is executed successfully with 1 completed step.


##### Step 2.

In [5]:
## Columbia's cluster
cwd=$UKBB_PATH/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f3393
#This is the bfile obtained in step 1
genoFile=$cwd/*.bed
# Format FID, IID, pop
phenoFile=$UKBB_PATH/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/071122_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k.phenopca
label_col=ethnicity
pop_col=ethnicity
pca_sbatch=$cwd/flashpca_f3393_pc_$(date +"%Y-%m-%d").sbatch
k=2
maha_k=2
min_axis=""
max_axis=""

pca_sos=$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/PCA.ipynb

pca_args="""flashpca
    --cwd $cwd
    --genoFile $genoFile
    --phenoFile $phenoFile
    --k $k
    --maha_k $maha_k
    --label_col $label_col
    --pop_col $pop_col
    --min_axis $min_axis
    --max_axis $max_axis
    --numThreads $numThreads 
    --job_size $job_size
    --container $container_lmm
"""

sos run $USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $pca_sos \
    --to-script $pca_sbatch \
    --args "$pca_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f3393/flashpca_f3393_pc_2022-07-11.sbatch[0m
INFO: Workflow csg (ID=w33c3970745c7458e) is executed successfully with 1 completed step.


#### f.2247

##### Step 1

In [7]:
## Columbia's cluster
cwd=$UKBB_PATH/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f2247
#gwas_sbatch=$USER_PATH/UKBB_GWAS_dev/output/qc1_f2247_qcarray_$(date +"%Y-%m-%d").sbatch
gwas_sbatch=$cwd/qc1_f2247_qcarray_$(date +"%Y-%m-%d").sbatch
## Use qc'ed genotype array
genoFile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
keep_samples=$UKBB_PATH/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_difficulty_f2247_expandedwhite_107994cases_236699ctrl_500k_PC1_PC2.keep_id
#Keep variants after LD pruning
keep_variants=$UKBB_PATH/results/092821_PCA_results_500K/092821_ldprun_unrelated/cache/*092821_ldprun_unrelated.filtered.prune.in

#GWAS QC variables set all of this variables to 0 to avoid doing more filtering
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
numThreads=1
mem='30G'

gwasqc_sos=$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/GWAS_QC.ipynb

gwasqc_args="""qc:1
    --cwd $cwd
    --genoFile $genoFile
    --keep_samples $keep_samples
    --keep_variants $keep_variants
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --numThreads $numThreads 
    --job_size $job_size
    --container $container_lmm
    --mem $mem
"""

sos run $USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $gwasqc_sos \
    --to-script $gwas_sbatch \
    --args "$gwasqc_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f2247/qc1_f2247_qcarray_2022-07-11.sbatch[0m
INFO: Workflow csg (ID=w0480799a289fd655) is executed successfully with 1 completed step.


##### Step 2.

In [14]:
## Columbia's cluster
cwd=$UKBB_PATH/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f2247
#This is the bfile obtained in step 1
genoFile=$cwd/*.bed
# Format FID, IID, pop
phenoFile=$UKBB_PATH/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/071122_UKBB_Hearing_difficulty_f2247_expandedwhite_107994cases_236699ctrl_500k.phenopca
label_col=ethnicity
pop_col=ethnicity
pca_sbatch=$cwd/flashpca_f2247_pc_$(date +"%Y-%m-%d").sbatch
k=2
maha_k=2
min_axis=""
max_axis=""

pca_sos=$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/PCA.ipynb

pca_args="""flashpca
    --cwd $cwd
    --genoFile $genoFile
    --phenoFile $phenoFile
    --k $k
    --maha_k $maha_k
    --label_col $label_col
    --pop_col $pop_col
    --min_axis $min_axis
    --max_axis $max_axis
    --numThreads $numThreads 
    --job_size $job_size
    --container $container_lmm
"""

sos run  $USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $pca_sos \
    --to-script $pca_sbatch \
    --args "$pca_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f2247/flashpca_f2247_pc_2022-07-11.sbatch[0m
INFO: Workflow csg (ID=wa728fcf455ac0e6a) is executed successfully with 1 completed step.


#### f.2257

##### Step 1

In [15]:
## Columbia's cluster
cwd=$UKBB_PATH/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f2257
#gwas_sbatch=$USER_PATH/UKBB_GWAS_dev/output/qc1_f2257_qcarray_$(date +"%Y-%m-%d").sbatch
gwas_sbatch=$cwd/qc1_f2257_qcarray_$(date +"%Y-%m-%d").sbatch
## Use qc'ed genotype array
genoFile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
keep_samples=$UKBB_PATH/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_noise_f2257_expandedwhite_159033cases_236699ctrl_500k_PC1_PC2.keep_id
#Keep variants after LD pruning
keep_variants=$UKBB_PATH/results/092821_PCA_results_500K/092821_ldprun_unrelated/cache/*092821_ldprun_unrelated.filtered.prune.in

#GWAS QC variables set all of this variables to 0 to avoid doing more filtering
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
numThreads=1
mem='30G'

gwasqc_sos=$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/GWAS_QC.ipynb

gwasqc_args="""qc:1
    --cwd $cwd
    --genoFile $genoFile
    --keep_samples $keep_samples
    --keep_variants $keep_variants
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --numThreads $numThreads 
    --job_size $job_size
    --container $container_lmm
    --mem $mem
"""

sos run $USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $gwasqc_sos \
    --to-script $gwas_sbatch \
    --args "$gwasqc_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f2257/qc1_f2257_qcarray_2022-07-11.sbatch[0m
INFO: Workflow csg (ID=w86710bcece4549a1) is executed successfully with 1 completed step.


##### Step 2.

In [16]:
## Columbia's cluster
cwd=$UKBB_PATH/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f2257
#This is the bfile obtained in step 1
genoFile=$cwd/*.bed
# Format FID, IID, pop
phenoFile=$UKBB_PATH/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/071122_UKBB_Hearing_noise_f2257_expandedwhite_159033cases_236699ctrl_500k.phenopca
label_col=ethnicity
pop_col=ethnicity
pca_sbatch=$cwd/flashpca_f2257_pc_$(date +"%Y-%m-%d").sbatch
k=2
maha_k=2
min_axis=""
max_axis=""

pca_sos=$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/PCA.ipynb

pca_args="""flashpca
    --cwd $cwd
    --genoFile $genoFile
    --phenoFile $phenoFile
    --k $k
    --maha_k $maha_k
    --label_col $label_col
    --pop_col $pop_col
    --min_axis $min_axis
    --max_axis $max_axis
    --numThreads $numThreads 
    --job_size $job_size
    --container $container_lmm
"""

sos run  $USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $pca_sos \
    --to-script $pca_sbatch \
    --args "$pca_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f2257/flashpca_f2257_pc_2022-07-11.sbatch[0m
INFO: Workflow csg (ID=wd6b8aa394484cf0b) is executed successfully with 1 completed step.


#### Combined f.2247 & f.2257

##### Step 1

In [17]:
## Columbia's cluster
cwd=$UKBB_PATH/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f2247_f2257
#gwas_sbatch=$USER_PATH/UKBB_GWAS_dev/output/qc1_f2247_f2257_qcarray_$(date +"%Y-%m-%d").sbatch
gwas_sbatch=$cwd/qc1_f2247_f2257_qcarray_$(date +"%Y-%m-%d").sbatch
## Use qc'ed genotype array
genoFile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
keep_samples=$UKBB_PATH/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Combined_f2247_f2257_expandedwhite_91092cases_236699ctrl_500k_PC1_PC2.keep_id
#Keep variants after LD pruning
keep_variants=$UKBB_PATH/results/092821_PCA_results_500K/092821_ldprun_unrelated/cache/*092821_ldprun_unrelated.filtered.prune.in

#GWAS QC variables set all of this variables to 0 to avoid doing more filtering
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
numThreads=1
mem='30G'

gwasqc_sos=$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/GWAS_QC.ipynb

gwasqc_args="""qc:1
    --cwd $cwd
    --genoFile $genoFile
    --keep_samples $keep_samples
    --keep_variants $keep_variants
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --numThreads $numThreads 
    --job_size $job_size
    --container $container_lmm
    --mem $mem
"""

sos run $USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $gwasqc_sos \
    --to-script $gwas_sbatch \
    --args "$gwasqc_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f2247_f2257/qc1_f2247_f2257_qcarray_2022-07-11.sbatch[0m
INFO: Workflow csg (ID=w5aece13fb22324c9) is executed successfully with 1 completed step.


##### Step 2.

In [18]:
## Columbia's cluster
cwd=$UKBB_PATH/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f2247_f2257
#This is the bfile obtained in step 1
genoFile=$cwd/*.bed
# Format FID, IID, pop
phenoFile=$UKBB_PATH/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/071122_UKBB_Combined_f2247_f2257_expandedwhite_91092cases_236699ctrl_500k.phenopca
label_col=ethnicity
pop_col=ethnicity
pca_sbatch=$cwd/flashpca_f2247_f2257_pc_$(date +"%Y-%m-%d").sbatch
k=2
maha_k=2
min_axis=""
max_axis=""

pca_sos=$USER_PATH/xqtl-pipeline/code/data_preprocessing/genotype/PCA.ipynb

pca_args="""flashpca
    --cwd $cwd
    --genoFile $genoFile
    --phenoFile $phenoFile
    --k $k
    --maha_k $maha_k
    --label_col $label_col
    --pop_col $pop_col
    --min_axis $min_axis
    --max_axis $max_axis
    --numThreads $numThreads 
    --job_size $job_size
    --container $container_lmm
"""

sos run  $USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $pca_sos \
    --to-script $pca_sbatch \
    --args "$pca_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f2247_f2257/flashpca_f2247_f2257_pc_2022-07-11.sbatch[0m
INFO: Workflow csg (ID=w0ca2ac5cfecfd1ca) is executed successfully with 1 completed step.


## Combine the PCs in the phenotype files

In [1]:
ls /home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI

070722_UKBB_Combined_f2247_f2257_expandedwhite_91092cases_236699ctrl_500k_PC1_PC2
070722_UKBB_Combined_f2247_f2257_expandedwhite_91092cases_236699ctrl_500k_PC1_PC2.keep_id
070722_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k_PC1_PC2
070722_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k_PC1_PC2.keep_id
070722_UKBB_Hearing_difficulty_f2247_expandedwhite_107994cases_236699ctrl_500k_PC1_PC2
070722_UKBB_Hearing_difficulty_f2247_expandedwhite_107994cases_236699ctrl_500k_PC1_PC2.keep_id
070722_UKBB_Hearing_noise_f2257_expandedwhite_159033cases_236699ctrl_500k_PC1_PC2
070722_UKBB_Hearing_noise_f2257_expandedwhite_159033cases_236699ctrl_500k_PC1_PC2.keep_id
071122_UKBB_Combined_f2247_f2257_expandedwhite_91092cases_236699ctrl_500k.phenopca
071122_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k.phenopca
071122_UKBB_Hearing_difficulty_f2247_expandedwhite_107994cases_236699ctrl_500k.phenopca
071122_UKBB_Hearing_noise_f2257_expandedwhite_159033cases

Phenotype files named "070722_*_500k_PC1_PC2" are with original PCs calculated by original sample size with code 1415. "071222_*_500k_PC1_PC2" will be the new phenotype files with the newly calculated PCs

In [2]:
# f3393
import pandas as pd
phe = pd.read_csv("~/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k_PC1_PC2",header=0,sep="\t")
phe

Unnamed: 0,FID,IID,sex,f3393,age,PC1,PC2
0,1000112,1000112,0,1,68,0.011931,0.031198
1,1001067,1001067,0,1,50,0.004472,0.002904
2,1001384,1001384,1,1,61,0.002001,-0.001850
3,1001459,1001459,0,1,64,0.010770,0.022391
4,1002548,1002548,0,1,62,0.000226,-0.014383
...,...,...,...,...,...,...,...
251428,6025363,6025363,0,0,64,0.004982,0.006218
251429,6025409,6025409,1,0,66,0.002693,-0.003802
251430,6025411,6025411,1,0,49,0.014998,0.007031
251431,6025425,6025425,1,0,44,0.017568,0.020817


In [4]:
pc = pd.read_csv("~/UKBiobank/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f3393/071122_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k.pca.txt",header=0,sep="\t")
pc

Unnamed: 0,IID,FID,ethnicity,ID,PC1,PC2
0,1000063,1000063,British,1000063:1000063,-0.001268,0.011031
1,1000078,1000078,British,1000078:1000078,-0.006248,0.009816
2,1000081,1000081,British,1000081:1000081,0.118765,-0.016446
3,1000112,1000112,British,1000112:1000112,-0.011938,-0.031102
4,1000129,1000129,British,1000129:1000129,-0.000505,0.011894
...,...,...,...,...,...,...
251428,6025363,6025363,British,6025363:6025363,-0.004974,-0.006233
251429,6025409,6025409,British,6025409:6025409,-0.002686,0.003770
251430,6025411,6025411,British,6025411:6025411,-0.015043,-0.007018
251431,6025425,6025425,British,6025425:6025425,-0.017598,-0.020819


In [8]:
phe = phe[['FID', 'IID', 'sex', 'f3393', 'age']]
phe = phe.merge(pc[["IID","PC1","PC2"]],how="left",left_on="IID",right_on="IID")
phe

Unnamed: 0,FID,IID,sex,f3393,age,PC1,PC2
0,1000112,1000112,0,1,68,-0.011938,-0.031102
1,1001067,1001067,0,1,50,-0.004491,-0.002928
2,1001384,1001384,1,1,61,-0.001991,0.001819
3,1001459,1001459,0,1,64,-0.010785,-0.022329
4,1002548,1002548,0,1,62,-0.000190,0.014417
...,...,...,...,...,...,...,...
251428,6025363,6025363,0,0,64,-0.004974,-0.006233
251429,6025409,6025409,1,0,66,-0.002686,0.003770
251430,6025411,6025411,1,0,49,-0.015043,-0.007018
251431,6025425,6025425,1,0,44,-0.017598,-0.020819


In [10]:
phe.to_csv("~/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/071222_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k_PC1_PC2",index=False,sep="\t")

In [11]:
# f2247
phe = pd.read_csv("~/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_difficulty_f2247_expandedwhite_107994cases_236699ctrl_500k_PC1_PC2",header=0,sep="\t")
pc = pd.read_csv("~/UKBiobank/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f2247/071122_UKBB_Hearing_difficulty_f2247_expandedwhite_107994cases_236699ctrl_500k.pca.txt",header=0,sep="\t")
phe = phe[['FID', 'IID', 'sex', 'f2247', 'age']]
phe = phe.merge(pc[["IID","PC1","PC2"]],how="left",left_on="IID",right_on="IID")
phe.to_csv("~/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/071222_UKBB_Hearing_difficulty_f2247_expandedwhite_107994cases_236699ctrl_500k_PC1_PC2",index=False,sep="\t")
# f2257
phe = pd.read_csv("~/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Hearing_noise_f2257_expandedwhite_159033cases_236699ctrl_500k_PC1_PC2",header=0,sep="\t")
pc = pd.read_csv("~/UKBiobank/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f3393/071122_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k.pca.txt",header=0,sep="\t")
phe = phe[['FID', 'IID', 'sex', 'f2257', 'age']]
phe = phe.merge(pc[["IID","PC1","PC2"]],how="left",left_on="IID",right_on="IID")
phe.to_csv("~/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/071222_UKBB_Hearing_noise_f2257_expandedwhite_159033cases_236699ctrl_500k_PC1_PC2",index=False,sep="\t")
# f2247_f2257
phe = pd.read_csv("~/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/070722_UKBB_Combined_f2247_f2257_expandedwhite_91092cases_236699ctrl_500k_PC1_PC2",header=0,sep="\t")
pc = pd.read_csv("~/UKBiobank/results/092821_PCA_results_500K/071122_PCA_500k_exclu_1415/f2247_f2257/071122_UKBB_Combined_f2247_f2257_expandedwhite_91092cases_236699ctrl_500k.pca.txt",header=0,sep="\t")
phe = phe[['FID', 'IID', 'sex', 'f2247_f2257', 'age']]
phe = phe.merge(pc[["IID","PC1","PC2"]],how="left",left_on="IID",right_on="IID")
phe.to_csv("~/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/071222_UKBB_Combined_f2247_f2257_expandedwhite_91092cases_236699ctrl_500k_PC1_PC2",index=False,sep="\t")

## Run association analysis with imputed data for each phenotype

### 500k

#### f.3393

In [9]:
cwd=~/UKBiobank/results/REGENIE_results/results_imputed_data/071222_500k_without_1415/f3393
lmm_dir_regenie=$cwd
lmm_sbatch_regenie=$cwd/f3393_500K_impdata_regenie_$(date +"%Y-%m-%d").sbatch
phenoFile=$hearing_pheno_path/fulldb_500K/pleiotropy_AD_ARHI/071222_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k_PC1_PC2
covarFile=$hearing_pheno_path/fulldb_500K/pleiotropy_AD_ARHI/071222_UKBB_Hearing_aid_f3393_expandedwhite_14734cases_236699ctrl_500k_PC1_PC2
phenoCol=f3393
covarCol=sex
qCovarCol="age PC1 PC2"
bgenMinINFO=0.8
bgenMinMAF=0.001
minMAC=4
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
#Use the original bed files for the genotype array on regenie step1
bfile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Use the imputed data hg19 to run the association analysis
genoFile=`echo ~/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr{1..22}_v3.bgen`
sampleFile=$UKBB_yale/ukb39554_imputeddataset/ukb32285_imputedindiv.sample
label_annotate=SNP
lowmem_dir=$cwd/predictions

lmm_args="""regenie
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --sampleFile $sampleFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --lowmem_dir $lowmem_dir
    --trait $trait 
    --bgenMinMAF $bgenMinMAF 
    --bgenMinINFO $bgenMinINFO
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --minMAC $minMAC
    --job_size $lmm_job_size
    --ylim $ylim
    --label_annotate $label_annotate
    --no-annotate
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run $USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/REGENIE_results/results_imputed_data/071222_500k_without_1415/f3393/f3393_500K_impdata_regenie_2022-07-12.sbatch[0m
INFO: Workflow csg (ID=w4a3815c4dd568e8d) is executed successfully with 1 completed step.


#### f.2247

In [10]:
cwd=~/UKBiobank/results/REGENIE_results/results_imputed_data/071222_500k_without_1415/f2247
lmm_dir_regenie=$cwd
lmm_sbatch_regenie=$cwd/f2247_500K_impdata_regenie_$(date +"%Y-%m-%d").sbatch
phenoFile=$hearing_pheno_path/fulldb_500K/pleiotropy_AD_ARHI/071222_UKBB_Hearing_difficulty_f2247_expandedwhite_107994cases_236699ctrl_500k_PC1_PC2
covarFile=$hearing_pheno_path/fulldb_500K/pleiotropy_AD_ARHI/071222_UKBB_Hearing_difficulty_f2247_expandedwhite_107994cases_236699ctrl_500k_PC1_PC2
phenoCol=f2247
covarCol=sex
qCovarCol="age PC1 PC2"
bgenMinINFO=0.8
bgenMinMAF=0.001
minMAC=4
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
#Use the original bed files for the genotype array on regenie step1
bfile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Use the imputed data hg19 to run the association analysis
genoFile=`echo ~/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr{1..22}_v3.bgen`
sampleFile=$UKBB_yale/ukb39554_imputeddataset/ukb32285_imputedindiv.sample
label_annotate=SNP
lowmem_dir=$cwd/predictions

lmm_args="""regenie
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --sampleFile $sampleFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --lowmem_dir $lowmem_dir
    --trait $trait 
    --bgenMinMAF $bgenMinMAF 
    --bgenMinINFO $bgenMinINFO
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --minMAC $minMAC
    --job_size $lmm_job_size
    --ylim $ylim
    --label_annotate $label_annotate
    --no-annotate
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run $USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/REGENIE_results/results_imputed_data/071222_500k_without_1415/f2247/f2247_500K_impdata_regenie_2022-07-12.sbatch[0m
INFO: Workflow csg (ID=w329a5209646a2078) is executed successfully with 1 completed step.


#### f.2257

In [11]:
cwd=~/UKBiobank/results/REGENIE_results/results_imputed_data/071222_500k_without_1415/f2257
lmm_dir_regenie=$cwd
lmm_sbatch_regenie=$cwd/f2257_500K_impdata_regenie_$(date +"%Y-%m-%d").sbatch
phenoFile=$hearing_pheno_path/fulldb_500K/pleiotropy_AD_ARHI/071222_UKBB_Hearing_noise_f2257_expandedwhite_159033cases_236699ctrl_500k_PC1_PC2
covarFile=$hearing_pheno_path/fulldb_500K/pleiotropy_AD_ARHI/071222_UKBB_Hearing_noise_f2257_expandedwhite_159033cases_236699ctrl_500k_PC1_PC2
phenoCol=f2257
covarCol=sex
qCovarCol="age PC1 PC2"
bgenMinINFO=0.8
bgenMinMAF=0.001
minMAC=4
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
#Use the original bed files for the genotype array on regenie step1
bfile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Use the imputed data hg19 to run the association analysis
genoFile=`echo ~/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr{1..22}_v3.bgen`
sampleFile=$UKBB_yale/ukb39554_imputeddataset/ukb32285_imputedindiv.sample
label_annotate=SNP
lowmem_dir=$cwd/predictions

lmm_args="""regenie
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --sampleFile $sampleFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --lowmem_dir $lowmem_dir
    --trait $trait 
    --bgenMinMAF $bgenMinMAF 
    --bgenMinINFO $bgenMinINFO
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --minMAC $minMAC
    --job_size $lmm_job_size
    --ylim $ylim
    --label_annotate $label_annotate
    --no-annotate
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run $USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/REGENIE_results/results_imputed_data/071222_500k_without_1415/f2257/f2257_500K_impdata_regenie_2022-07-12.sbatch[0m
INFO: Workflow csg (ID=wdbffea273c2939f5) is executed successfully with 1 completed step.


#### Combined f.2247 & f.2257

In [12]:
cwd=~/UKBiobank/results/REGENIE_results/results_imputed_data/071222_500k_without_1415/f2247_f2257
lmm_dir_regenie=$cwd
lmm_sbatch_regenie=$cwd/f2247_f2257_500K_impdata_regenie_$(date +"%Y-%m-%d").sbatch
phenoFile=$hearing_pheno_path/fulldb_500K/pleiotropy_AD_ARHI/071222_UKBB_Combined_f2247_f2257_expandedwhite_91092cases_236699ctrl_500k_PC1_PC2
covarFile=$hearing_pheno_path/fulldb_500K/pleiotropy_AD_ARHI/071222_UKBB_Combined_f2247_f2257_expandedwhite_91092cases_236699ctrl_500k_PC1_PC2
phenoCol=f2247_f2257
covarCol=sex
qCovarCol="age PC1 PC2"
bgenMinINFO=0.8
bgenMinMAF=0.001
minMAC=4
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
#Use the original bed files for the genotype array on regenie step1
bfile=$UKBB_PATH/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# Use the imputed data hg19 to run the association analysis
genoFile=`echo ~/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr{1..22}_v3.bgen`
sampleFile=$UKBB_yale/ukb39554_imputeddataset/ukb32285_imputedindiv.sample
label_annotate=SNP
lowmem_dir=$cwd/predictions

lmm_args="""regenie
    --cwd $lmm_dir_regenie 
    --bfile $bfile 
    --genoFile $genoFile
    --sampleFile $sampleFile
    --phenoFile $phenoFile 
    --formatFile $formatFile_regenie 
    --phenoCol $phenoCol
    --covarCol $covarCol  
    --qCovarCol $qCovarCol
    --bsize $bsize
    --lowmem_dir $lowmem_dir
    --trait $trait 
    --bgenMinMAF $bgenMinMAF 
    --bgenMinINFO $bgenMinINFO
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --minMAC $minMAC
    --job_size $lmm_job_size
    --ylim $ylim
    --label_annotate $label_annotate
    --no-annotate
    --reverse_log_p $reverse_log_p
    --numThreads $numThreads
    --container_lmm $container_lmm
    --container_marp $container_marp
"""

sos run $USER_PATH/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $lmm_sos \
    --to-script $lmm_sbatch_regenie \
    --args "$lmm_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/gl2776/UKBiobank/results/REGENIE_results/results_imputed_data/071222_500k_without_1415/f2247_f2257/f2247_f2257_500K_impdata_regenie_2022-07-12.sbatch[0m
INFO: Workflow csg (ID=w2037a78a2962a0b7) is executed successfully with 1 completed step.


## Do liftover of summary stats for 300K imputed data for HI

### f.3393

In [21]:
liftover_sos=~/project/UKBB_GWAS_dev/workflow/liftover_hg19tohg38.ipynb
lmm_dir_regenie=$lmm_imp_dir_regenie/2021_10_13_f3393_300K/
input_path=$lmm_imp_dir_regenie/2021_10_13_f3393_300K/100521_UKBB_Hearing_aid_f3393_expandedwhite_9165cases_141007ctrl_300k_PC1_PC2_f3393.regenie.snp_stats.gz
liftover_sbatch=~/project/UKBB_GWAS_dev/output/f3393_liftover_300K_$(date +"%Y-%m-%d").sbatch

liftover="""default
    --cwd $lmm_dir_regenie 
    --input_path $input_path
    --container_lmm $container_lmm
"""

sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $liftover_sos \
    --to-script $liftover_sbatch \
    --args "$liftover"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/project/UKBB_GWAS_dev/output/f3393_liftover_300K_2021-11-14.sbatch[0m
INFO: Workflow csg (ID=wecd8e7289a8fac7e) is executed successfully with 1 completed step.


### f.2247

In [29]:
liftover_sos=~/project/UKBB_GWAS_dev/workflow/liftover_hg19tohg38.ipynb
lmm_dir_regenie=$lmm_imp_dir_regenie/2021_10_13_f2247_300K/
input_path=$lmm_imp_dir_regenie/2021_10_13_f2247_300K/100521_UKBB_Hearing_difficulty_f2247_expandedwhite_64953cases_141007ctrl_300k_PC1_PC2_f2247.regenie.snp_stats.gz
liftover_sbatch=~/project/UKBB_GWAS_dev/output/f2247_liftover_300K_$(date +"%Y-%m-%d").sbatch

liftover="""default
    --cwd $lmm_dir_regenie 
    --input_path $input_path
    --container_lmm $container_lmm
"""

sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $liftover_sos \
    --to-script $liftover_sbatch \
    --args "$liftover"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/project/UKBB_GWAS_dev/output/f2247_liftover_300K_2021-11-14.sbatch[0m
INFO: Workflow csg (ID=we1f3277392e2c31b) is executed successfully with 1 completed step.


### f.2257

In [22]:
liftover_sos=~/project/UKBB_GWAS_dev/workflow/liftover_hg19tohg38.ipynb
lmm_dir_regenie=$lmm_imp_dir_regenie/2021_10_13_f2257_300K/
input_path=$lmm_imp_dir_regenie/2021_10_13_f2257_300K/100521_UKBB_Hearing_noise_f2257_expandedwhite_95794cases_141007ctrl_300k_PC1_PC2_f2257.regenie.snp_stats.gz
liftover_sbatch=~/project/UKBB_GWAS_dev/output/f2257_liftover_300K_$(date +"%Y-%m-%d").sbatch

liftover="""default
    --cwd $lmm_dir_regenie 
    --input_path $input_path
    --container_lmm $container_lmm
"""

sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $liftover_sos \
    --to-script $liftover_sbatch \
    --args "$liftover"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/project/UKBB_GWAS_dev/output/f2257_liftover_300K_2021-11-14.sbatch[0m
INFO: Workflow csg (ID=w031d9249c62a2e2a) is executed successfully with 1 completed step.


### f.2247 & f.2257

In [23]:
liftover_sos=~/project/UKBB_GWAS_dev/workflow/liftover_hg19tohg38.ipynb
lmm_dir_regenie=$lmm_imp_dir_regenie/2021_10_13_f2247_f2257_300K/
input_path=$lmm_imp_dir_regenie/2021_10_13_f2247_f2257_300K/100521_UKBB_Combined_f2247_f2257_expandedwhite_54853cases_141007ctrl_300k_PC1_PC2_f2247_f2257.regenie.snp_stats.gz
liftover_sbatch=~/project/UKBB_GWAS_dev/output/f2247_f2257_liftover_300K_$(date +"%Y-%m-%d").sbatch

liftover="""default
    --cwd $lmm_dir_regenie 
    --input_path $input_path
    --container_lmm $container_lmm
"""

sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg\
    --template-file $tpl_file \
    --workflow-file $liftover_sos \
    --to-script $liftover_sbatch \
    --args "$liftover"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/project/UKBB_GWAS_dev/output/f2247_f2257_liftover_300K_2021-11-14.sbatch[0m
INFO: Workflow csg (ID=w22a470d981327bc2) is executed successfully with 1 completed step.


## Do the hudson plot to illustrate the 200K exomes vs the 300K imputed data individuals

### f.3393

In [33]:
hudson_sos=~/project/bioworkflows/GWAS/Hudson_Plot.ipynb
hudson_dir=$UKBB_PATH/results/hudson_plots/f3393_200Kexome_300kimp_hg38
hudson_sbatch=~/output/hudson/f3393_exome200K_imp300K_hg38_hudson_$(date +"%Y-%m-%d").sbatch
sumstats_1=$UKBB_PATH/results/REGENIE_results/results_exome_data/090921_f3393_hearing_aid_200K/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2_f3393.regenie.snp_stats.gz
sumstats_2=$UKBB_PATH/results/REGENIE_results/results_imputed_data/2021_10_13_f3393_300K/100521_UKBB_Hearing_aid_f3393_expandedwhite_9165cases_141007ctrl_300k_PC1_PC2_f3393.regenie.hg38.sumstats_mapped.gz
toptitle="f3393_exome_200K"
bottomtitle="f3393_imputed_300K"
phenocol1="f3393_exome"
phenocol2="f3393_imp"
highlight_p_top=5e-08
highlight_p_bottom=5e-08
pval_filter=5e-08
job_size=1
container_lmm=~/containers/lmm.sif

hudson_args="""hudson
    --cwd $hudson_dir
    --sumstats_1 $sumstats_1
    --sumstats_2 $sumstats_2
    --toptitle $toptitle
    --bottomtitle $bottomtitle
    --phenocol1 $phenocol1
    --phenocol2 $phenocol2
    --job_size $job_size
    --highlight_p_top $highlight_p_top
    --highlight_p_bottom $highlight_p_bottom
    --pval_filter $pval_filter
    --container_lmm $container_lmm
"""
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb  csg\
    --template-file $tpl_file \
    --workflow-file $hudson_sos \
    --to-script $hudson_sbatch \
    --args "$hudson_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/output/hudson/f3393_exome200K_imp300K_hg38_hudson_2021-11-14.sbatch[0m
INFO: Workflow csg (ID=wf91a4e14d8a92db2) is executed successfully with 1 completed step.


### f.2247

In [34]:
hudson_sos=~/project/bioworkflows/GWAS/Hudson_Plot.ipynb
hudson_dir=$UKBB_PATH/results/hudson_plots/f2247_200Kexome_300kimp_hg38
hudson_sbatch=~/output/hudson/f2247_imp300K_exome200K_hg38_hudson_$(date +"%Y-%m-%d").sbatch
sumstats_1=$UKBB_PATH/results/REGENIE_results/results_exome_data/090921_f2247_hearing_difficulty_200K/090321_UKBB_Hearing_difficulty_f2247_expandedwhite_45502cases_96601ctrl_PC1_2_f2247.regenie.snp_stats.gz
sumstats_2=$UKBB_PATH/results/REGENIE_results/results_imputed_data/2021_10_13_f2247_300K/*.sumstats_mapped.gz
toptitle="f2247_exome_200K"
bottomtitle="f2247_imputed_300K"
phenocol1="f2247_exome"
phenocol2="f2247_imp"
highlight_p_top=5e-08
highlight_p_bottom=5e-08
pval_filter=5e-08
job_size=1
container_lmm=~/containers/lmm.sif

hudson_args="""hudson
    --cwd $hudson_dir
    --sumstats_1 $sumstats_1
    --sumstats_2 $sumstats_2
    --toptitle $toptitle
    --bottomtitle $bottomtitle
    --phenocol1 $phenocol1
    --phenocol2 $phenocol2
    --job_size $job_size
    --highlight_p_top $highlight_p_top
    --highlight_p_bottom $highlight_p_bottom
    --pval_filter $pval_filter
    --container_lmm $container_lmm
"""
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb  csg\
    --template-file $tpl_file \
    --workflow-file $hudson_sos \
    --to-script $hudson_sbatch \
    --args "$hudson_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/output/hudson/f2247_imp300K_exome200K_hg38_hudson_2021-11-14.sbatch[0m
INFO: Workflow csg (ID=w691d182dabd90aa1) is executed successfully with 1 completed step.


### f.2257

In [31]:
hudson_sos=~/project/bioworkflows/GWAS/Hudson_Plot.ipynb
hudson_dir=$UKBB_PATH/results/hudson_plots/f2257_200Kexome_300kimp_hg38
hudson_sbatch=~/output/hudson/f2257_exome200K_imp300K_hg38_hudson_$(date +"%Y-%m-%d").sbatch
sumstats_1=$UKBB_PATH/results/REGENIE_results/results_exome_data/090921_f2257_hearing_noise_200K/*.regenie.snp_stats.gz
sumstats_2=$UKBB_PATH/results/REGENIE_results/results_imputed_data/2021_10_13_f2257_300K/*.sumstats_mapped.gz
toptitle="f2257_exome_200K"
bottomtitle="f2257_imputed_300K"
phenocol1="f2257_exome"
phenocol2="f2257_imp"
highlight_p_top=5e-08
highlight_p_bottom=5e-08
pval_filter=5e-08
job_size=1
container_lmm=~/containers/lmm.sif

hudson_args="""hudson
    --cwd $hudson_dir
    --sumstats_1 $sumstats_1
    --sumstats_2 $sumstats_2
    --toptitle $toptitle
    --bottomtitle $bottomtitle
    --phenocol1 $phenocol1
    --phenocol2 $phenocol2
    --job_size $job_size
    --highlight_p_top $highlight_p_top
    --highlight_p_bottom $highlight_p_bottom
    --pval_filter $pval_filter
    --container_lmm $container_lmm
"""
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb  csg\
    --template-file $tpl_file \
    --workflow-file $hudson_sos \
    --to-script $hudson_sbatch \
    --args "$hudson_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/output/hudson/f2257_exome200K_imp300K_hg38_hudson_2021-11-14.sbatch[0m
INFO: Workflow csg (ID=w10d0794e6132b078) is executed successfully with 1 completed step.


### Combined f.2247 & f.2257

In [32]:
hudson_sos=~/project/bioworkflows/GWAS/Hudson_Plot.ipynb
hudson_dir=$UKBB_PATH/results/hudson_plots/f2247_f2257_200Kexome_300kimp_hg38
hudson_sbatch=~/output/hudson/f2247_f2257_exome200K_imp300K_hg38_hudson_$(date +"%Y-%m-%d").sbatch
sumstats_1=$UKBB_PATH/results/REGENIE_results/results_exome_data/090921_combined_f2247_f2257_200K/*.regenie.snp_stats.gz
sumstats_2=$UKBB_PATH/results/REGENIE_results/results_imputed_data/2021_10_13_f2247_f2257_300K/*.sumstats_mapped.gz
toptitle="combined_exome_200K"
bottomtitle="combined_imputed_300K"
phenocol1="f2247_f2257_exome"
phenocol2="f2247_f2257_imp"
highlight_p_top=5e-08
highlight_p_bottom=5e-08
pval_filter=5e-08
job_size=1
container_lmm=~/containers/lmm.sif

hudson_args="""hudson
    --cwd $hudson_dir
    --sumstats_1 $sumstats_1
    --sumstats_2 $sumstats_2
    --toptitle $toptitle
    --bottomtitle $bottomtitle
    --phenocol1 $phenocol1
    --phenocol2 $phenocol2
    --job_size $job_size
    --highlight_p_top $highlight_p_top
    --highlight_p_bottom $highlight_p_bottom
    --pval_filter $pval_filter
    --container_lmm $container_lmm
"""
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb  csg\
    --template-file $tpl_file \
    --workflow-file $hudson_sos \
    --to-script $hudson_sbatch \
    --args "$hudson_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/output/hudson/f2247_f2257_exome200K_imp300K_hg38_hudson_2021-11-14.sbatch[0m
INFO: Workflow csg (ID=w50811e04a77ee223) is executed successfully with 1 completed step.


## Use Yin's pipeline to:

1. Do liftover of the GRhC 37 coordinates of the imputed data to GRhC 38
2. Merge imputed and exome genotype data
3. Do fine mapping analysis. 