In [None]:
from datetime import date, datetime

import os
import numpy as np
import pandas as pd
import pandas
import pandas_profiling
import plotnine
from plotnine import *  # Provides a ggplot-like interface to matplotlib.
import matplotlib.pyplot as plt
import subprocess
import seaborn as sns
import math
import datetime

## GWAS Summary Statistics Generation Steps for All of Us

### SAIGE - Step1 - GRM and Null Model Fitting

In [None]:
# Trait names are the column labels at positions 15..123 of the phenotype table
# (the slice end, 124, is exclusive). Only the labels are kept, not the data.
# NOTE(review): presumably the first 15 columns hold IDs/covariates - confirm.
traits = pd.read_csv('./Phenotypes/eur_basic_traits.tsv', sep='\t').columns[15:124]
num_traits = traits.size
print(num_traits)


In [None]:
bucket = os.getenv('WORKSPACE_BUCKET')
gwas_dir = 'SAIGE_GENE'
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'saige-step1-{USER_NAME}'
%env JOB_NAME={JOB_NAME}
%env PHEN={phen_name}

params_df = pd.DataFrame(data={
    '--input-recursive INPUT_DIR': [f"{bucket}/{gwas_dir}/step1_input/"]*num_traits,
    '--input-recursive GRM_DIR': [f"{bucket}/{gwas_dir}/step0_output/"]*num_traits,
    '--output-recursive OUT_DIR': [f"{bucket}/{gwas_dir}/step1_output/"]*num_traits,
    '--env traitType':['quantitative' if x.__contains__('f.') else 'binary' for x in traits],
    '--env invNormalize':[True if x.__contains__('f.') else False for x in traits],    
    '--env PHEN': traits
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_output = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}_${PHEN}" \
  --provider google-cls-v2 \
  --image "wzhou88/saige:1.3.0" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/saige_step1" \
  --boot-disk-size 50 \
  --disk-size 128 \
  --min-ram 32 \
  --min-cores 8 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'step1_fitNULLGLMM.R \
                --sparseGRMFile=$GRM_DIR/sparseGRM_relatednessCutoff_0.125_1000_randomMarkersUsed.sparseGRM.mtx \
                --sparseGRMSampleIDFile=$GRM_DIR/sparseGRM_relatednessCutoff_0.125_1000_randomMarkersUsed.sparseGRM.mtx.sampleIDs.txt \
                --useSparseGRMtoFitNULL=TRUE \
                --bedFile=$INPUT_DIR/pruned_arrays_eur.bed \
                --bimFile=$INPUT_DIR/pruned_arrays_eur_new.bim \
                --famFile=$INPUT_DIR/pruned_arrays_eur.fam \
                --phenoFile=$INPUT_DIR/eur_basic_traits.tsv  \
                --invNormalize=$invNormalize \
                --phenoCol=${PHEN} \
                --covarColList=SEX,AGE,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10 \
                --qCovarColList=SEX \
                --sampleIDColinphenoFile=IID \
                --traitType=$traitType \
                --isCateVarianceRatio=TRUE \
                --outputPrefix=$OUT_DIR/${PHEN}_step1_output \
                --IsOverwriteVarianceRatioFile=TRUE \
                --LOCO=FALSE \
                --nThreads=8'
job_output

### SAIGE - Step2 - Single Variant Association Testing

In [None]:

# Step 2: Single variant association testing

USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'saige-step2-{USER_NAME}'
%env JOB_NAME={JOB_NAME}
bucket = os.getenv('WORKSPACE_BUCKET')
gwas_dir = 'SAIGE_GENE'


exome_bgen_dir = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/exome_v7.1/bgen"

params_df = pd.DataFrame(data={
    '--input-recursive STEP1_OUT': [f"{bucket}/{gwas_dir}/step1_output" for _ in range(22*num_traits)],
    '--input bgenFile': [f"{exome_bgen_dir}/exome.chr{x+1}.bgen" for x in range(22)]*num_traits,
    '--input bgenFileIndex': [f"{exome_bgen_dir}/exome.chr{x+1}.bgen.bgi" for x in range(22)]*num_traits,
    '--input sampleFile': [f"{exome_bgen_dir}/exome.chr{x+1}.sample" for x in range(22)]*num_traits,
    '--input groupFile': [f"{bucket}/{gwas_dir}/step2_input/UKB470k_chr_{x+1}_groupfile.loftee.edit.txt" for x in range(22)]*num_traits,
    '--env PHEN': [x for x in traits for _ in range(22)],
    '--env CHROM': [x+1 for x in range(22)]*num_traits,
    '--output-recursive OUT_DIR': [f"{bucket}/{gwas_dir}/step2_output/" for _ in range(22*num_traits)]
})


PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)
job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-batch \
  --image "wzhou88/saige:1.3.0" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/saige_step2" \
  --boot-disk-size 100 \
  --disk-size 200 \
  --min-ram 32 \
  --tasks "${PARAMETER_FILENAME}" \
  --command 'step2_SPAtests.R \
                --bgenFile=${bgenFile} \
                --bgenFileIndex=${bgenFileIndex} \
                --minMAF=0 \
                --AlleleOrder=ref-first \
                --is_output_moreDetails=TRUE \
                --is_overwrite_output=TRUE \
                --chrom=chr${CHROM} \
                --GMMATmodelFile=$STEP1_OUT/${PHEN}_step1_output.rda \
                --varianceRatioFile=$STEP1_OUT/${PHEN}_step1_output.varianceRatio.txt \
                --LOCO=FALSE \
                --SAIGEOutputFile=$OUT_DIR/${PHEN}_chr${CHROM}_step2_output_single.txt \
                '
print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## Sparse LD Matrix Generation

In [None]:
cmd_line = 'step3_LDmat.R \
    --bgenFile=${bgenFile} \
    --bgenFileIndex=${bgenFileIndex} \
    --sample_include_inLDMat_File=$ids \
    --AlleleOrder=ref-first \
    --chrom=$CHROM \
    --SAIGEOutputFile=$OUT_DIR/${maf}_${anno}_chr${CHR} \
    --groupFile=$groupFile/UKB470k_chr_${CHR}_groupfile.txt \
    --annotation_in_groupTest=${annotation_in_groupTest} \
    --is_overwrite_output=TRUE \
    --maxMAF_in_groupTest=${maf} \
    '
with open("Cmd_step3.sh", "w") as text_file:
    text_file.write(cmd_line)

USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'saige-step3-{USER_NAME}'
%env JOB_NAME={JOB_NAME}
bucket = os.getenv('WORKSPACE_BUCKET')
gwas_dir = 'SAIGE_GENE'

exome_plink_dir = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/exome/plink_bed"

exome_bgen_dir = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/exome_v7.1/bgen"
anno = ['missense_lof_synonymous']
chr = [x+1 for x in range(22)]
mafs=([0.01)*22
i=0
print(f"{bucket}/{gwas_dir}/step3_output/{mafs[i]}_{anno[i]}_chr{str(chr[i])}/")
params_df = pd.DataFrame(data={
    '--input ids': [f"{bucket}/{gwas_dir}/step1_input/ehr_IDs.txt" for _ in range(22)],
    '--input bgenFile': [f"{exome_bgen_dir}/exome.chr{x+1}.bgen" for x in range(22)],
    '--input bgenFileIndex': [f"{exome_bgen_dir}/exome.chr{x+1}.bgen.bgi" for x in range(22)],
    '--input-recursive groupFile': [f"{bucket}/{gwas_dir}/step2_input/" for _ in range(22)],
    '--env maf': mafs,
    '--env CHR': chr,
    '--env CHROM': ['chr'+str(x+1) for x in range(22)],
    '--output-recursive OUT_DIR': [f"{bucket}/{gwas_dir}/step3_output" for i in range(22)],
    '--env annotation_in_groupTest': ['missense;lof;synonymous']*22,
    '--env anno': anno
})

PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}

params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-batch \
  --regions us-central1 \
  --image "wzhou88/saige:1.3.0" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/saige_step3" \
  --boot-disk-size 50 \
  --disk-size 100 \
  --min-ram 10 \
  --tasks "${PARAMETER_FILENAME}" \
  --script "Cmd_step3.sh"
print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}


## Meta-SAIGE Analysis Between All of Us and UKB (EUR)

In [None]:
# Task script for the two-cohort (UKB + All of Us EUR) Meta-SAIGE run; written to
# Cmd_meta.sh and executed by dsub once per row of the step 4 task table.
#
# INPUT_DIR1, INPUT_DIR2, INPUT_GWAS1, INPUT_GWAS2, OUT_DIR, maf, anno, chrom and
# Phecode are all provided by the task table (--input / --input-recursive /
# --output-recursive / --env columns). The script must NOT reassign the INPUT_*
# variables: the step 4 job does not mount or define BUCKET, so rebuilding the
# paths from ${BUCKET} would replace the localized inputs with non-existent paths.
#
# The two head/tail/awk lines copy the cohort-1 GWAS and marker-info files to
# /app with "chr" prepended to the chromosome columns (header line kept as is);
# those copies are what RV_meta_GC.R receives for cohort 1.
# The "\\t" below becomes "\t" in the script file, and each backslash-newline is
# a Python string continuation, so the Rscript call is written on a single line.
cmd_line = '''head -1 $INPUT_GWAS1 > /app/tmp_GWAS1.txt ; tail -n+2 $INPUT_GWAS1 | awk 'BEGIN{OFS="\\t"}$1="chr"$1, $3="chr"$3'  >> /app/tmp_GWAS1.txt 
head -1 $INPUT_DIR1/WES470k_chr${chrom}.marker_info.txt > /app/tmp_mkr_info1.txt ; tail -n+2 $INPUT_DIR1/WES470k_chr${chrom}.marker_info.txt | awk 'BEGIN{OFS="\\t"}$1="chr"$1' >> /app/tmp_mkr_info1.txt 
/usr/bin/time --verbose Rscript /app/RV_meta_GC.R \
--num_cohorts 2 \
--chr ${chrom} \
--col_co 10 \
--trait_type binary \
--verbose FALSE \
--info_file_path /app/tmp_mkr_info1.txt $INPUT_DIR2/${maf}_${anno}_chr${chrom}_loftee.marker_info.txt \
--gene_file_prefix $INPUT_DIR1/WES470k_chr${chrom}_ $INPUT_DIR2/${maf}_${anno}_chr${chrom}_loftee_ \
--gwas_path /app/tmp_GWAS1.txt $INPUT_GWAS2 \
--output_prefix $OUT_DIR/Meta_${Phecode}_${maf}_${anno}_chr${chrom}_loftee \
    '''
with open("Cmd_meta.sh", "w") as text_file:
    text_file.write(cmd_line)

In [None]:
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
JOB_NAME=f'saige-step4-{USER_NAME}'
%env JOB_NAME={JOB_NAME}
bucket = os.getenv('WORKSPACE_BUCKET')
gwas_dir = 'SAIGE_GENE'

exome_bgen_dir = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/exome/bgen"

phecodes = pd.read_csv('phecodes.txt',header = None, dtype=object)
phecodes = phecodes.drop_duplicates()
phecodes = phecodes[0].tolist()
phecodes_all = list(np.repeat(phecodes, 22*3*3))
tot_len = len(phecodes)*3*3*22
maf=(['0.01']*22+['0.001']*22+['0.0001']*22)*3*len(phecodes)
anno=(['lof']*66+['missense_lof']*66+['missense_lof_synonymous']*66)*len(phecodes)
chrom=[i for i in range(1,23)]*3*3*len(phecodes)

params_df = pd.DataFrame(data={
    '--input-recursive INPUT_DIR1': [f"{bucket}/{gwas_dir}/imported/step3_docker/step3_docker/WES470k_{maf[i]}_{anno[i]}_chr{chrom[i]}/" for i in range(tot_len)],
    '--input-recursive INPUT_DIR2': [f"{bucket}/{gwas_dir}/step3_output/{maf[i]}_{anno[i]}_chr{chrom[i]}/" for i in range(tot_len)],
    '--input INPUT_GWAS1': [f"{bucket}/{gwas_dir}/imported/step2_phenome/step2/WES470k_Whites_{phecodes_all[i]}_chr{chrom[i]}_gene" for i in range(tot_len)],
    '--input INPUT_GWAS2': [f"{bucket}/{gwas_dir}/step2_output/Pheno_{phecodes_all[i]}_chr{chrom[i]}_step2_output_single.txt" for i in range(tot_len)],
    '--env maf': maf,
    '--env Phecode': phecodes_all,
    '--output-recursive OUT_DIR': [f"{bucket}/{gwas_dir}/step4_output/" for _ in range(tot_len)],
    '--env anno': anno,
    '--env chrom': chrom,    
})
PARAMETER_FILENAME = f'{JOB_NAME}_params.tsv'
%env PARAMETER_FILENAME={PARAMETER_FILENAME}
%env IMAGE_PATH={} # Please supply
params_df.to_csv(PARAMETER_FILENAME, sep='\t', index=False)

job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-batch \
  --regions us-central1 \
  --image "${IMAGE_PATH}/meta-saige:0.3.0" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/saige_step4" \
  --boot-disk-size 30 \
  --disk-size 50 \
  --min-ram 10 \
  --tasks "${PARAMETER_FILENAME}" \
  --script "Cmd_meta.sh"
#print("\n".join(job_id))
job_id = job_id[1].split(" ")[-1]
%env JOB_ID={job_id}

## Multi-Ancestry Meta-Analysis with Meta-SAIGE Using All of Us (EUR, AFR, AMR) and UKB

In [None]:
# Task script for the four-cohort Meta-SAIGE run (UKB, then three All of Us
# cohorts read from the plain, afr_ and amr_ step 3 directories); written to
# Cmd_ma_recoded.sh. ${BUCKET}, $OUT_DIR, ${maf}, ${anno}, ${i}, ${phen} and
# ${Phecode} are expected from the dsub job that runs it.
#
# The head/tail/awk lines copy the cohort-1 GWAS and marker-info files to /app
# with "chr" prepended to the chromosome columns. Those recoded copies are now
# the ones handed to RV_meta_GC.R for cohort 1, matching Cmd_meta.sh; previously
# they were generated but the raw files were passed instead.
# NOTE(review): confirm the recoded files are the intended cohort-1 inputs.
# --gene_file_prefix still points at the original INPUT_DIR1 files, as in
# Cmd_meta.sh.
cmd_line = '''
INPUT_DIR1=${BUCKET}/SAIGE_GENE/imported/step3_docker/step3_docker/WES470k_${maf}_${anno}_chr${i}/
INPUT_DIR2=${BUCKET}/SAIGE_GENE/step3_output/${maf}_${anno}_chr${i}/
INPUT_DIR3=${BUCKET}/SAIGE_GENE/step3_output/afr_${maf}_${anno}_chr${i}/
INPUT_DIR4=${BUCKET}/SAIGE_GENE/step3_output/amr_${maf}_${anno}_chr${i}/
INPUT_GWAS1=${BUCKET}/SAIGE_GENE/imported/step2_phenome/step2/WES470k_Whites_${Phecode}_chr${i}_gene
INPUT_GWAS2=${BUCKET}/SAIGE_GENE/step2_output/Pheno_${Phecode}_chr${i}_step2_output_single.txt
INPUT_GWAS3=${BUCKET}/SAIGE_GENE/step2_output/${phen}_afr_chr${i}_step2_output_single.txt
INPUT_GWAS4=${BUCKET}/SAIGE_GENE/step2_output/${phen}_amr_chr${i}_step2_output_single.txt
head -1 $INPUT_GWAS1 > /app/tmp_chr${i}_GWAS1.txt ; tail -n+2 $INPUT_GWAS1 | awk 'BEGIN{OFS="\\t"}$1="chr"$1, $3="chr"$3'  >> /app/tmp_chr${i}_GWAS1.txt 
head -1 $INPUT_DIR1/WES470k_chr${i}.marker_info.txt > /app/tmp_chr${i}_mkr_info1.txt ; tail -n+2 $INPUT_DIR1/WES470k_chr${i}.marker_info.txt | awk 'BEGIN{OFS="\\t"}$1="chr"$1' >> /app/tmp_chr${i}_mkr_info1.txt 
cp ${BUCKET}/SAIGE_GENE/saige_meta_scripts/Lib_GC.R /app/
cp ${BUCKET}/SAIGE_GENE/saige_meta_scripts/MetaSAIGE.R /app/
chrom=$i
/usr/bin/time --verbose Rscript /app/RV_meta_GC.R \
    --num_cohorts 4 \
    --chr ${chrom} \
    --col_co 10 \
    --info_file_path /app/tmp_chr${i}_mkr_info1.txt $INPUT_DIR2/${maf}_${anno}_chr${chrom}_loftee.marker_info.txt $INPUT_DIR3/afr_${maf}_${anno}_chr${chrom}_loftee.marker_info.txt $INPUT_DIR4/amr_${maf}_${anno}_chr${chrom}_loftee.marker_info.txt \
    --gene_file_prefix $INPUT_DIR1/WES470k_chr${chrom}_ $INPUT_DIR2/${maf}_${anno}_chr${chrom}_loftee_ $INPUT_DIR3/afr_${maf}_${anno}_chr${chrom}_loftee_ $INPUT_DIR4/amr_${maf}_${anno}_chr${chrom}_loftee_ \
    --gwas_path /app/tmp_chr${i}_GWAS1.txt $INPUT_GWAS2 $INPUT_GWAS3 $INPUT_GWAS4 \
    --ancestry 1 1 2 3 \
    --trait_type binary \
    --verbose FALSE \
    --output_prefix $OUT_DIR/Multi_ancestry_${phen}_${maf}_${anno}_chr${chrom}_GCfix
'''
with open("Cmd_ma_recoded.sh", "w") as text_file:
    text_file.write(cmd_line)

In [None]:
USER_NAME = os.getenv('OWNER_EMAIL').split('@')[0].replace('.','-')
%env USER_NAME={USER_NAME}
bucket = os.getenv('WORKSPACE_BUCKET')
gwas_dir = 'SAIGE_GENE'
maf=(['0.01']*22+['0.001']*22+['0.0001']*22)*3*2
anno=(['lof']*66+['missense_lof']*66+['missense_lof_synonymous']*66)*2
phen = ['T2D']*198+['colorectal']*198
phen_ukb = ['t2d']*198+['colca']*198
phecodes = ['250.2']*198+['153']*198
chrom=[i for i in range(1,23)]*3*3*2
tot_len = len(maf)
params_df0 = pd.DataFrame(data={
    '--env maf': maf,
    '--env phen': phen,
    '--env phen_ukb': phen_ukb,
    '--output-recursive OUT_DIR': [f"{bucket}/{gwas_dir}/step4_output_recode/" for _ in range(tot_len)],
    '--env Phecode': phecodes,
    '--env anno': anno,
    '--env i': chrom,    
})
params_df0=params_df0.loc[(params_df0['--env i'].isin([2]))]
params_df0.to_csv('saige-step4_ma.tsv',sep = '\t', index = False)
%env PARAMETER_FILENAME=saige-step4_ma.tsv
%env JOB_NAME=meta_mult
job_id = !source ~/aou_dsub.bash; aou_dsub \
  --name "${JOB_NAME}" \
  --provider google-cls-v2 \
  --image "gcr.io/pheweb/meta-saige:0.2.3" \
  --logging "${WORKSPACE_BUCKET}/dsub_logs/saige_step4" \
  --boot-disk-size 10 \
  --disk-size 20 \
  --min-ram 15 \
  --tasks "${PARAMETER_FILENAME}" \
  --mount BUCKET="${WORKSPACE_BUCKET}" \
  --script "Cmd_ma_recoded.sh"
#print("\n".join(job_id))
job_id2 = job_id[1].split(" ")[-1]
%env JOB_ID={job_id2}
job_id