# Association analysis imputed data

Run the association analysis for the hearing impairment traits using the white European individuals present in the 500K samples.

## Columbia's cluster

In [1]:
# Shared paths for Columbia's cluster; everything is rooted at $HOME.

# UK Biobank data roots
UKBB_PATH="${HOME}/UKBiobank"
UKBB_yale="${HOME}/UKBiobank_Yale_transfer"

# Project checkout holding the workflows and templates
USER_PATH="${HOME}/project"

# Container images (.sif)
container_lmm="${HOME}/containers/lmm.sif"
container_marp="${HOME}/containers/marp.sif"
container_annovar="${HOME}/containers/gatk4-annovar.sif"

# Hearing impairment phenotype directory
hearing_pheno_path="${UKBB_PATH}/phenotype_files/hearing_impairment"

# Job template passed as --template-file when generating job scripts
tpl_file="${USER_PATH}/bioworkflows/admin/csg.yml"

# Per-tool format templates (one per association method)
formatFile_fastgwa="${USER_PATH}/UKBB_GWAS_dev/data/fastGWA_template.yml"
formatFile_bolt="${USER_PATH}/UKBB_GWAS_dev/data/boltlmm_template.yml"
formatFile_saige="${USER_PATH}/UKBB_GWAS_dev/data/saige_template.yml"
formatFile_regenie="${USER_PATH}/UKBB_GWAS_dev/data/regenie_template.yml"

  msg['msg_id'] = self._parent_header['header']['msg_id']


## Shared variables for workflows and results

In [2]:
# Workflow notebooks (all resolved relative to $USER_PATH)
lmm_sos="${USER_PATH}/bioworkflows/GWAS/LMM.ipynb"
anno_sos="${USER_PATH}/bioworkflows/variant-annotation/annovar.ipynb"
clumping_sos="${USER_PATH}/bioworkflows/GWAS/LD_Clumping.ipynb"
extract_sos="${USER_PATH}/bioworkflows/GWAS/Region_Extraction.ipynb"
snptogene_sos="${USER_PATH}/UKBB_GWAS_dev/workflow/snptogene.ipynb"

# Result directories, imputed data (one per association tool)
lmm_imp_dir_fastgwa="${UKBB_PATH}/results/FastGWA_results/results_imputed_data"
lmm_imp_dir_bolt="${UKBB_PATH}/results/BOLTLMM_results/results_imputed_data"
lmm_imp_dir_saige="${UKBB_PATH}/results/SAIGE_results/results_imputed_data"
lmm_imp_dir_regenie="${UKBB_PATH}/results/REGENIE_results/results_imputed_data"

# Result directories, exome data (one per association tool)
lmm_exome_dir_fastgwa="${UKBB_PATH}/results/FastGWA_results/results_exome_data"
lmm_exome_dir_bolt="${UKBB_PATH}/results/BOLTLMM_results/results_exome_data"
lmm_exome_dir_saige="${UKBB_PATH}/results/SAIGE_results/results_exome_data"
lmm_exome_dir_regenie="${UKBB_PATH}/results/REGENIE_results/results_exome_data"

In [3]:
## LMM variables, grouped by the tool that consumes them

## BOLT-LMM
LDscoresFile="${UKBB_PATH}/LDSCORE.1000G_EUR.tab.gz"
geneticMapFile="${UKBB_PATH}/genetic_map_hg19_withX.txt.gz"
covarMaxLevels=10
numThreads=20
bgenMinMAF=0.001
bgenMinINFO=0.8
lmm_job_size=1
ylim=0

### FastGWA (the GRM location depends on whether you run from Yale or Columbia)
### Both assignments are kept in their original order; when the cell is run
### top to bottom the later (Columbia) value is the one that takes effect.
#### Yale's cluster
grmFile="${UKBB_PATH}/results/FastGWA_results/results_imputed_data/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.grm.sp"
#### Columbia's cluster
grmFile="${UKBB_yale}/results/FastGWA_results/results_imputed_data/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.grm.sp"

### SAIGE
bgenMinMAC=4
trait_type=binary
loco=TRUE
sampleCol=IID

### REGENIE
bsize=1000
lowmem="${HOME}/scratch60/"
lowmem_dir="${HOME}/scratch60/predictions"
trait=bt
minMAC=4
maf_filter=0.01
geno_filter=0.01
hwe_filter=0
mind_filter=0.1
reverse_log_p=True

## Subset the individuals that passed the genotype QC from the ukb47922 data

### Subsetting with mind 99%

In [2]:
%save /home/dmc2245/project/UKBB_GWAS_dev/output/subset_ukb47922_500K_callrate99.R -f
# Subset the ukb47922 phenotype database to the samples listed in the QC'ed
# genotype .fam file and write the merged table to a CSV in the working directory.
setwd('/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB')

# The code below relies on this script creating the data frame `bd`.
source("/mnt/mfs/statgen/UKBiobank/data/ukbb_databases/ukb47922_updatedAug2021/ukb47922.r")
print('Finished loading database')
nrow(bd)

## Use the fam file from the QC'ed genotype data with N=436,698 using mind=0.01
fam_path <- "/mnt/mfs/statgen/UKBiobank/genotype_files_processed/083021_sample_variant_qc_final/cache/UKB_genotypedatadownloaded083019.083021_sample_variant_qc_final.filtered.extracted.fam"
geno_fam <- read.table(fam_path, header = FALSE, stringsAsFactors = FALSE)
# The file is read without a header, so name the six columns explicitly.
colnames(geno_fam) <- c("FID", "IID", "ignore1", "ignore2", "ignore3", "ignore4")
nrow(geno_fam)

# Rename the first database column to IID so it can be used as the merge key;
# the two head() calls show that column before and after the rename.
head(bd[, 1, drop = FALSE])
names(bd)[1] <- "IID"
head(bd[, 1, drop = FALSE])

# all = FALSE keeps only the IIDs present in both tables.
geno_pheno <- merge(geno_fam, bd, by = "IID", all = FALSE)
print('Subsetting of the database completed')
nrow(geno_pheno)

write.csv(geno_pheno, '092821_UKBB_436698ind.csv', row.names = FALSE)
print('Finished writing the csv file')

In [3]:
%save /home/dmc2245/project/UKBB_GWAS_dev/output/subset_ukb47922_500K.sh -f
#!/bin/sh
# Job script that runs the call-rate-99% subsetting R script on the cluster.
# Lines starting with "#$" are scheduler directives and must stay as written.
# Requested run time and memory:
#$ -l h_rt=36:00:00
#$ -l h_vmem=200G
#$ -N subset_ukb47922
#$ -o /home/dmc2245/project/UKBB_GWAS_dev/output/subset_ukb47922_$JOB_ID.out
#$ -e /home/dmc2245/project/UKBB_GWAS_dev/output/subset_ukb47922_$JOB_ID.err
#$ -j y
#$ -q csg.q
#$ -S /bin/bash
export PATH=$HOME/miniconda3/bin:$PATH
module load Singularity/3.5.3
module load R/4.0
# Run the R script under the name it is saved with by the %save magic of the
# call-rate-99% R cell (subset_ukb47922_500K_callrate99.R); the previous path
# pointed at subset_ukb47922_500K.R, which no cell in this notebook creates.
Rscript /home/dmc2245/project/UKBB_GWAS_dev/output/subset_ukb47922_500K_callrate99.R

### Subsetting with mind 90%

In [4]:
%save /home/dmc2245/project/UKBB_GWAS_dev/output/subset_ukb47922_500K_callrate90.R -f
# Subset the ukb47922 phenotype database to the samples listed in the
# call-rate-90 genotype .fam file and write the merged table to a CSV in the
# working directory.
setwd('/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB')

# The code below relies on this script creating the data frame `bd`.
source("/mnt/mfs/statgen/UKBiobank/data/ukbb_databases/ukb47922_updatedAug2021/ukb47922.r")
print('Finished loading database')
nrow(bd)

## Use the fam file from the QC'ed genotype data with N=486,416 using mind=0.1
fam_path <- "/mnt/mfs/statgen/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.fam"
geno_fam <- read.table(fam_path, header = FALSE, stringsAsFactors = FALSE)
# The file is read without a header, so name the six columns explicitly.
colnames(geno_fam) <- c("FID", "IID", "ignore1", "ignore2", "ignore3", "ignore4")
nrow(geno_fam)

# Rename the first database column to IID so it can be used as the merge key;
# the two head() calls show that column before and after the rename.
head(bd[, 1, drop = FALSE])
names(bd)[1] <- "IID"
head(bd[, 1, drop = FALSE])

# all = FALSE keeps only the IIDs present in both tables.
geno_pheno <- merge(geno_fam, bd, by = "IID", all = FALSE)
print('Subsetting of the database completed')
nrow(geno_pheno)

write.csv(geno_pheno, '092821_UKBB_486416ind_call90.csv', row.names = FALSE)
print('Finished writing the csv file')

In [5]:
%save /home/dmc2245/project/UKBB_GWAS_dev/output/subset_ukb47922_500K_call90.sh -f
#!/bin/sh
# Job script that runs the call-rate-90% subsetting R script on the cluster.
# Lines starting with "#$" are scheduler directives and must stay as written.
# Requested run time and memory:
#$ -l h_rt=36:00:00
#$ -l h_vmem=200G
# Job name, and log file paths tagged with the job id:
#$ -N subset_ukb47922_call90
#$ -o /home/dmc2245/project/UKBB_GWAS_dev/output/subset_ukb47922_call90_$JOB_ID.out
#$ -e /home/dmc2245/project/UKBB_GWAS_dev/output/subset_ukb47922_call90_$JOB_ID.err
#$ -j y
# Queue and the shell used to interpret the job:
#$ -q csg.q
#$ -S /bin/bash
# Put the user's miniconda binaries first on PATH, then load the cluster modules.
export PATH=$HOME/miniconda3/bin:$PATH
module load Singularity/3.5.3
module load R/4.0
# Run the R script saved by the call-rate-90% R cell of this notebook.
Rscript /home/dmc2245/project/UKBB_GWAS_dev/output/subset_ukb47922_500K_callrate90.R

## Run hearing_impairment notebook to get the white Europeans for PCA

After running the notebook `092821_hearing_impairment_500K.ipynb`, the following files are generated for the PCA analysis:

"/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/ukb47922_white_460649ind.pheno" these are the ancestry phenotypes

"/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/ukb47922_white_460649ind.keep_id" these are the IID to keep

## Run PCA analysis for the whole sample of white Europeans

### 09-28-21 PCA with QC'ed genotype array data

#### Step 1. Select European individuals from the genotype data

In [4]:
# Columbia's cluster
# Generate the job script for step qc:1 of GWAS_QC.ipynb, passing the list of
# white individuals as --keep_samples.
# Relies on variables set in earlier cells: UKBB_PATH, USER_PATH, tpl_file,
# container_lmm and numThreads.
cwd=$UKBB_PATH/results/092821_PCA_results_500K/white_europeans
# bfile with sample and variant QC containing all of the samples (Columbia's cluster)
## here I used the bfile in which individuals with call rate >90% were retained
genoFile=~/UKBiobank/genotype_files_processed/090221_sample_variant_qc_final_callrate90/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.bed
# To keep the samples of white individuals only
keep_samples=~/UKBiobank/phenotype_files/HI_UKBB/ukb47922_white_460649ind.keep_id
# QC is already done, so no need to filter any more
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
mem='30G'
# job_size is not assigned in any earlier cell of this notebook (only
# lmm_job_size is), so default it here; a value already present in the session
# is kept. Without this, --job_size below would be passed with no value.
job_size=${job_size:-1}
gwasqc_sos=$USER_PATH/bioworkflows/GWAS/GWAS_QC.ipynb
# The output script name is stamped with today's date
gwasqc_sbatch=$USER_PATH/UKBB_GWAS_dev/output/select_white_europeans_500K_$(date +"%Y-%m-%d").sbatch

gwasqc1_args="""qc:1
    --cwd $cwd
    --genoFile $genoFile
    --keep_samples $keep_samples
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --numThreads $numThreads
    --job_size $job_size
    --container_lmm $container_lmm
    --mem $mem
"""

sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $gwasqc_sos \
    --to-script $gwasqc_sbatch \
    --args "$gwasqc1_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/project/UKBB_GWAS_dev/output/select_white_europeans_500K_2021-09-28.sbatch[0m
INFO: Workflow csg (ID=w88641f4bec37ee3f) is executed successfully with 1 completed step.


#### Step 2. Run KING

In [7]:
## Columbia's variables
# Generate the job script for the `king` step of GWAS_QC.ipynb, using the
# bfile written under white_europeans/cache as input.
# Relies on variables set in earlier cells: UKBB_PATH, USER_PATH, tpl_file
# and container_lmm.
cwd=$UKBB_PATH/results/092821_PCA_results_500K/092821_king
genoFile=$UKBB_PATH/results/092821_PCA_results_500K/white_europeans/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.white_europeans.filtered.bed
# The output script name is stamped with today's date
king_sbatch=$USER_PATH/UKBB_GWAS_dev/output/flashpca_king_extendedwhite_500K_$(date +"%Y-%m-%d").sbatch
# Value passed to the workflow's --kinship option
kinship=0.0625
gwasqc_sos=$USER_PATH/bioworkflows/GWAS/GWAS_QC.ipynb
numThreads=20
mem='30G'
walltime='36h'
# job_size is not assigned in any earlier cell of this notebook (only
# lmm_job_size is), so default it here; a value already present in the session
# is kept. Without this, --job_size below would be passed with no value.
job_size=${job_size:-1}

king_args="""king
    --cwd $cwd
    --genoFile $genoFile
    --kinship $kinship
    --numThreads $numThreads
    --job_size $job_size
    --container_lmm $container_lmm
    --mem $mem
    --walltime $walltime
    --no-maximize-unrelated
"""

sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $gwasqc_sos \
    --to-script $king_sbatch \
    --args "$king_args"

INFO: Running [32mcsg[0m: Configuration for Columbia csg partition cluster
INFO: [32mcsg[0m is [32mcompleted[0m.
INFO: [32mcsg[0m output:   [32m/home/dmc2245/project/UKBB_GWAS_dev/output/flashpca_king_extendedwhite_500K_2021-09-28.sbatch[0m
INFO: Workflow csg (ID=w5327bc6b64d3eeb2) is executed successfully with 1 completed step.


#### Remove related individuals and LD pruning

In [None]:
## Columbia's cluster
# Generate the job script for the `qc` workflow of GWAS_QC.ipynb, passing the
# KING related-individuals list as --remove_samples together with the LD
# pruning parameters.
# Relies on variables set in earlier cells: UKBB_PATH, USER_PATH, tpl_file,
# container_lmm and gwasqc_sos.
cwd=$UKBB_PATH/results/092821_PCA_results_500K/092821_ldprun_unrelated
## Use the qc version of the genotype array with the already filtered 189010 white individuals
## NOTE(review): 189010 does not match the 460649 in the keep_id file name used for the selection step -- confirm the count
genoFile=$UKBB_PATH/results/092821_PCA_results_500K/white_europeans/cache/*.filtered.bed
# Related individuals identified by KING; these samples are removed
remove_samples=$UKBB_PATH/results/092821_PCA_results_500K/092821_king/*.related_id

# GWAS QC variables: leave all the variables at 0 so there's no more filtering in the already filtered data
maf_filter=0
geno_filter=0
hwe_filter=0
mind_filter=0
# LD pruning variables
window=50
shift=10
r2=0.1
# The output script name is stamped with today's date
gwas_sbatch=$USER_PATH/UKBB_GWAS_dev/output/gwas_unrelated_european_$(date +"%Y-%m-%d").sbatch
numThreads=20
mem='30G'
# job_size is not assigned in any earlier cell of this notebook (only
# lmm_job_size is), so default it here; a value already present in the session
# is kept. Without this, --job_size below would be passed with no value.
job_size=${job_size:-1}

gwasqc_args="""qc
    --cwd $cwd
    --genoFile $genoFile
    --remove_samples $remove_samples
    --maf_filter $maf_filter
    --geno_filter $geno_filter
    --hwe_filter $hwe_filter
    --mind_filter $mind_filter
    --window $window
    --shift $shift
    --r2 $r2
    --numThreads $numThreads
    --job_size $job_size
    --container_lmm $container_lmm
    --mem $mem
"""

sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file $tpl_file \
    --workflow-file $gwasqc_sos \
    --to-script $gwas_sbatch \
    --args "$gwasqc_args"

## Get the phenotype files using the 092821_hearing_impairment_500K notebook

## Do liftover of genotype array and imputed data

## Run association analysis with imputed data for each phenotype

## Merge genotype and exome data for the individuals with exomes

This could be region specific to save time and memory