## Create a minimal example from the UKB data

Select 100 individuals from the bed files

1. Individuals to select: `/home/dc2325/scratch60/plink-clumping/samplesID.txt`
2. Filter the bgen files from chr21 and chr22 so that they contain only those individuals.

## Running this notebook

On the Yale Farnam cluster, run the workflow with the commands below.

### Running example for bgen files

```
sos run ~/project/pleiotropy_UKB/workflow/MWE.ipynb qctool \
    --cwd ~/scratch60/plink-clumping/MWE \
    --bfile /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindivs.bed \
    --genoFile `echo /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb39554_imputeddataset/ukb_imp_chr{21..22}_v3.bgen` \
    --sampleFile /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb39554_imputeddataset/ukb32285_imputedindiv.sample \
    --rsid `echo ~/scratch60/plink-clumping/chr{21..22}_filter_snps.txt` \
    --samplesPlink /home/dc2325/scratch60/plink-clumping/samples_plink.txt \
    --samplesID /home/dc2325/scratch60/plink-clumping/samplesID.txt \
    --samplesQctool /home/dc2325/scratch60/plink-clumping/samples_qctool.txt \
    --rawPheno2 /gpfs/gibbs/pi/dewan/data/UKBiobank/phenotype_files/pleiotropy_R01/phenotypesforanalysis/UKB_caucasians_BMIwaisthip_AsthmaAndT2D_withagesex_033120 \
    --rawPheno1 /gpfs/gibbs/pi/dewan/data/UKBiobank/phenotype_files/pleiotropy_R01/phenotypesforanalysis/Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols_forbolt030720 \
    --phenoFile /home/dc2325/scratch60/plink-clumping/MWE/phenotypes.txt \
    --unrelated_samples /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/pleiotropy_geneticfiles/unrelated_n307259/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620\
    --numThreads 20 \
    --job_size 1 \
    -c ~/project/UKBB_GWAS_dev/farnam.yml -q farnam -J 40 \
    &> sos-submission-MWE-061020.log

```

### Running example for plink files (e.g., exome data)

```
sos run ~/project/UKBB_GWAS_dev/workflow/MWE.ipynb plink_mwe \
    --cwd ~/scratch60/plink-clumping/MWE \
    --bfile /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated082020removedwithdrawnindiv.bed \
    --genoFile `echo /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c{21..22}_b0_v1.bed` \
    --sampleFile /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb39554_imputeddataset/ukb32285_imputedindiv.sample \
    --samplesPlink /gpfs/gibbs/pi/dewan/data/UKBiobank/MWE/burden/phenotype_burden_IID.txt \
    --phenoFile /gpfs/gibbs/pi/dewan/data/UKBiobank/MWE/burden/phenotype_burden.txt \
    --unrelated_samples /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/pleiotropy_geneticfiles/unrelated_n307259/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620 \
    -s build 
```


In [None]:
[global]
# Working directory: change accordingly
parameter: cwd = path
# Genotype file in plink binary format for genotype array
parameter: bfile = path
# Path to bgen/exome files (one or more files)
parameter: genoFile = paths
# Path to sample file for bgen files
parameter: sampleFile = path
# Samples to select for plink format contains two columns FID and IID
parameter: samplesPlink = path
# Phenotype file for both quantitative (BMI) and qualitative (asthma) traits
parameter: phenoFile = path
# Unrelated samples from UKB
parameter: unrelated_samples = path
# Specify the number of threads to use
parameter: numThreads = 20
# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Load specific modules.
# Each *_module value is used by the steps below as a `template=` for their
# bash actions: it loads the named module, echoes a confirmation, and then
# runs the step's own script in place of the {cmd} placeholder.
parameter: plink_module = '''
module load PLINK/1.90-beta5.3
echo "Module plink loaded"
{cmd}
'''
parameter: qctool_module = '''
module load QCTOOL/2.0-foss-2016b-rc7-CentOS6.8
echo "Module qctool loaded"
{cmd}
'''
parameter: plink2_module = '''
module load PLINK/2_x86_64_20180428
echo "Module plink2 loaded"
{cmd}
'''

In [None]:
# Extract the 100 individuals from the phenotypic files
[phenotypes]
# Raw phenotype to extract individuals
parameter: rawPheno2 = path
# Raw phenotype to extract individuals
parameter: rawPheno1 = path
# Samples to select for awk contains only one column IID
# NOTE(review): this parameter is declared but not referenced by the script
# below; the step writes its own {cwd}/samplesID.txt instead.
parameter: samplesID = path 
# _input[0] = rawPheno1, _input[1] = rawPheno2, _input[2] = unrelated_samples
input: rawPheno1, rawPheno2, unrelated_samples
# _output[0]: merged header + case/control rows
# _output[1]: column 2 of _output[0], sorted numerically
# _output[2]: _output[1] without its first line, joined with spaces (for qctool)
# _output[3]: columns 1 and 2 of _output[0] (for plink --keep)
output: f'{cwd}/phenotype_100samples.txt', f'{cwd}/samplesID.txt', f'{cwd}/samples_qctool.txt', f'{cwd}/samples_plink.txt'
# Script outline:
#   1. keep the rows of rawPheno2 whose first column also appears in the
#      first column of rawPheno1 (common_IDs.txt);
#   2. take the first line plus rows with column 5 == 1 (and, separately,
#      column 5 == 0), truncated to 51 lines each;
#   3. concatenate both selections keeping a single first line;
#   4. derive the three sample lists described above;
#   5. write the rows of unrelated_samples matching any listed ID to
#      unrelated_samplesID.txt (not a declared output), then remove the
#      intermediate 50_* and common_IDs.txt files.
bash: expand= "${ }", workdir = cwd
    awk 'FNR==NR{a[$1];next}($1 in a){print}' ${_input[0]} ${_input[1]} > common_IDs.txt
    cat common_IDs.txt | awk 'NR==1; $5==1 {print}' | head -n 51 > 50_cases.txt
    cat common_IDs.txt | awk 'NR==1; $5==0 {print}' | head -n 51 > 50_controls.txt
    awk 'FNR>1 || NR==1' 50_* > ${_output[0]}
    awk '{print $2}' ${_output[0]} | sort -k 1n > ${_output[1]}
    cat ${_output[1]} | awk -F " " 'NR>1 {print}; {ORS= " "}' > ${_output[2]}
    awk '{print $1,$2}' ${_output[0]} > ${_output[3]}
    grep -w -F -f ${_output[1]} ${_input[2]}  > unrelated_samplesID.txt
    rm 50_* common_IDs.txt

In [None]:
# Create .bed, .fam and .bim files restricted to the selected samples
[plink]
# _input[0] is the genotype-array fileset (given as its .bed file),
# _input[1] is the two-column FID/IID list of samples to keep.
input: bfile, samplesPlink
output: f'{cwd}/{_input[0]:b}.MWE_data.bed'
# plink's --bfile takes the fileset prefix, not the .bed file itself, so the
# extension is stripped with :n (same convention as the plink_mwe steps).
# --out likewise receives the output prefix; the script runs inside cwd.
bash: expand= "${ }", workdir = cwd, template = plink_module
    plink \
    --bfile ${_input[0]:n} \
    --keep ${_input[1]} \
    --make-bed \
    --out ${_output:bn}

In [None]:
# To filter snps with info score above 0.8
[snps]
# Select snps for each chromosome: one info-filtered mfi file for chr21 and
# one for chr22 (range(21, 23) yields exactly 21 and 22).
parameter: select_snps= paths([f'/gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb39554_imputeddataset/ukb_mfi_chr{x}_v3.txt_infoabove0.8' for x in range(21, 23)])
# Feed the files as real inputs, one per substep, so that _input is populated
# for both the output name and the awk command below.
input: select_snps, group_by=1
output: f'{cwd}/{_input:bn}.snps'
# Keep the last 50 identifiers from column 2 that contain "rs".
bash: expand="${ }", workdir=cwd
    awk '$2 ~ /rs/ { print $2 }' ${_input} | tail -n 50 > ${_output}

In [None]:
# Filter bgen files chr21 and chr22 with only the 100 individuals
# Cluster script mwe_chr2.sh
[qctool_1]
# Samples to select for qctool white-space delimited list of IID
parameter: samplesQctool = path
# Output the bgen file with 8bit formatting
parameter: bgen_bits=8
# Filter snps in bgenFiles
parameter: rsid = paths
# One substep per genotype file; each file is paired with the rsid entry at
# the same position, which the script reads back as _input._rsid.
input: genoFile, paired_with='rsid', group_by=1
output: f'{cwd}/{_input:bn}.filtered.bgen'
# Submitted as an external task; job_size sets how many substeps share a job.
task: trunk_workers = 1, trunk_size = job_size, cores = numThreads, walltime = '48h', mem = '60G', tags = f'{step_name}_{_output[0]:bn}'
# stdout/stderr are captured next to the output file; a matching .sample file
# is written with the same stem as the filtered bgen.
bash: expand= "${ }", workdir = cwd, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = qctool_module
    qctool \
    -g ${_input} \
    -s ${sampleFile} \
    -og ${_output} \
    -os ${_output:n}.sample \
    -incl-samples ${samplesQctool} \
    -incl-rsids ${_input._rsid} \
    -bgen-bits ${bgen_bits} \
    -threads ${numThreads}

In [None]:
# Index each filtered bgen file with bgenix
[qctool_2]
# output_from must be called as a function; written as a keyword
# (output_from=...) it does not resolve to the outputs of another step.
# group_by=1 indexes one bgen file per substep.
input: output_from('qctool_1'), group_by=1
# bgenix -index writes the index next to its input as <input>.bgi, so the
# declared output has to be exactly that path.
output: f'{_input}.bgi'
bash: expand= "${ }", workdir = cwd, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    bgenix \
    -g ${_input} \
    -index

In [None]:
# Filter bgen files chr21 and chr22 with only the 100 individuals and 8 bits format
[qctool_3]
# genoFile is the global parameter holding the bgen files (there is no
# bgenFile variable); each file is paired with the rsid entry at the same
# position, exactly as in qctool_1.
input: genoFile, paired_with= 'rsid', group_by=1
output: f'{cwd}/{_input:bn}.plink.filtered.bgen'
task: trunk_workers = 1, trunk_size = job_size, cores = numThreads, walltime = '48h', mem = '60G', tags = f'{step_name}_{_output[0]:bn}'
# plink2 --out takes a prefix and the bgen export appends ".bgen" itself, so
# the prefix is the declared output without its extension.
# NOTE(review): --extract is given the rsid path with its extension stripped
# (:n); confirm that a file with that exact name exists.
bash: expand= "${ }", workdir = cwd, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = plink2_module
    plink2 \
      --bgen ${_input} ref-first \
      --sample ${sampleFile} \
      --keep ${samplesPlink} \
      --extract ${_input._rsid:n} \
      --export bgen-1.2 "bits=8" \
      --threads ${numThreads} \
      --out ${_output:n}

## Generate MWE for plink exome data 

In [None]:
# Filter the exome data for chr21 and chr22
[plink_mwe_1]
# One substep per exome .bed file listed in genoFile.
input: genoFile, group_by=1
output: f'{cwd}/{_input:bn}.plink.exome.filtered.bed'
task: trunk_workers = 1, trunk_size = job_size, cores = numThreads, walltime = '48h', mem = '60G', tags = f'{step_name}_{_output:bn}'
# --bfile and --out both receive prefixes (paths with the .bed extension
# removed); only the samples listed in samplesPlink are kept.
bash: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = plink_module
    plink \
      --bfile ${_input:n}  \
      --keep ${samplesPlink} \
      --make-bed \
      --threads ${numThreads} \
      --out ${_output:n}

In [None]:
# Filter the genotype array for chr21 and chr22
[plink_mwe_2]
input: bfile
# _output[0] holds chromosome 21, _output[1] holds chromosome 22.
output: f'{cwd}/genotypes_21_plink.exome.bed', f'{cwd}/genotypes_22_plink.exome.bed'
task: trunk_workers = 1, trunk_size = job_size, cores = numThreads, walltime = '48h', mem = '60G', tags = f'{step_name}_{_output[0]:bn}'
# Two plink runs over the same fileset and sample list, differing only in
# --chr and in the output prefix; logs are named after the first output.
bash: expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', template = plink_module
    plink \
      --bfile ${_input:n}  \
      --keep ${samplesPlink} \
      --chr 21 \
      --make-bed \
      --threads ${numThreads} \
      --out ${_output[0]:n}
      
    plink \
      --bfile ${_input:n}  \
      --keep ${samplesPlink} \
      --chr 22 \
      --make-bed \
      --threads ${numThreads} \
      --out ${_output[1]:n}

In [None]:
# Merge the genotype files for chr21 and chr22
[plink_mwe_3]
# NOTE(review): no input statement is given; the script indexes _input[0] and
# _input[1], so it presumably relies on receiving the two .bed files produced
# by plink_mwe_2 - confirm.
output: f'{cwd}/genotypes_21_22_plink.exome.bed'
task: trunk_workers = 1, trunk_size = job_size, cores = numThreads, walltime = '48h', mem = '60G', tags = f'{step_name}_{_output:bn}'
# The first fileset is passed as a prefix to --bfile; the second is passed to
# --bmerge as explicit .bed/.bim/.fam paths sharing one stem.
bash: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = plink_module
    plink \
      --bfile ${_input[0]:n}  \
      --bmerge ${_input[1]} ${_input[1]:n}.bim ${_input[1]:n}.fam \
      --make-bed \
      --threads ${numThreads} \
      --out ${_output:n}

```
# Take the first line plus the rows with column 5 equal to 1 (then 0) from the
# asthma phenotype file, truncated to 51 lines each.
cat /gpfs/gibbs/pi/dewan/data/UKBiobank/phenotype_files/pleiotropy_R01/phenotypesforanalysis/Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols_forbolt030720 \
| awk 'NR==1; $5==1 {print}' | head -n 51 > asthma_cases.txt
cat /gpfs/gibbs/pi/dewan/data/UKBiobank/phenotype_files/pleiotropy_R01/phenotypesforanalysis/Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols_forbolt030720 \
| awk 'NR==1; $5==0 {print}' | head -n 51 > asthma_controls.txt

# This was to see which IDs were different between asthma and BMI
diff -y <( awk '{print $1}' samples_asthma_sorted.txt) <( awk '{print $1}' BMI_samples_sorted.txt )

# Look for common IDs in both files
# One way to do it with awk: remember column 1 of the first file, then print
# the rows of the second file whose column 1 was seen.
awk 'FNR==NR{a[$1];next}($1 in a){print}' Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols_forbolt030720 UKB_caucasians_BMIwaisthip_AsthmaAndT2D_withagesex_033120 > common_IDs.txt

# Another way to do it but files need to be sorted first
# NOTE(review): the two sort commands below only print to stdout, so the
# files read by comm are still in their original order - confirm whether the
# sorted output was meant to be saved and used instead.
sort -k 2n Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols_forbolt030720
sort -k 2n UKB_caucasians_BMIwaisthip_AsthmaAndT2D_withagesex_033120
comm -12  <( awk '{print $1}' Asthma_casesbyICD10codesANDselfreport_controlsbyselfreportandicd10_noautoimmuneincontrols_forbolt030720) \
<( awk '{print $1}' UKB_caucasians_BMIwaisthip_AsthmaAndT2D_withagesex_033120) > common_IDs.txt

# Now select 50 cases and 50 controls based on asthma (first line kept as header)
cat common_IDs.txt | awk 'NR==1; $5==1 {print}' | head -n 51 > asthma_cases.txt
cat common_IDs.txt | awk 'NR==1; $5==0 {print}' | head -n 51 > asthma_controls.txt

# Join the cases and controls in one file, keeping only the first file's first line
awk 'FNR>1 || NR==1' asthma_c* > asthma_samples.txt

# Create the samplesID file to be used (column 2, sorted numerically)
awk '{print $2}' asthma_samples.txt | sort -k 1n > samplesID.txt #only 84 are unrelated

# See if the samples are unrelated
grep -w -F -f samplesID.txt /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/pleiotropy_geneticfiles/unrelated_n307259/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620 > unrelated_samplesID.txt

# Create the samples file for qctool: drop the first line and join the
# remaining IDs with spaces
cat samplesID.txt | awk -F " " 'NR>1 {print}; {ORS= " "}' > samples_qctool.txt

# Create the sample file for plink (columns 1 and 2)
awk '{print $1,$2}' asthma_samples.txt > samples_plink.txt

# Select column with awk with partial matches: keep the last 50 identifiers
# from column 2 of the info-filtered mfi file that contain "rs".

awk '$2 ~ /rs/ { print $2 }' /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb39554_imputeddataset/ukb_mfi_chr21_v3.txt_infoabove0.8 | tail -n 50 > ~/scratch60/plink-clumping/chr21_snps.txt

# Turn the one-per-line list into a single space-separated line.
# chr21_snps.txt as written above has a single column, so the rsid is in $1;
# matching on $2 would select nothing and leave the filter file empty.
# chr22_snps.txt is assumed to be built the same way as chr21_snps.txt.
awk 'BEGIN {ORS=" "}; $1 ~ /rs/ { print $1 }' chr21_snps.txt > chr21_filter_snps.txt
awk 'BEGIN {ORS=" "}; $1 ~ /rs/ { print $1 }' chr22_snps.txt > chr22_filter_snps.txt

# Extract the 1000 unrelated samples from phenotype file
# Start the output with the header line of the phenotype file.
head -n 1 /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/pleiotropy_geneticfiles/unrelated_n307259/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620 > phenotype_1000ind.txt

# Append every phenotype row matching an ID in the 1000-sample list.
# Only one redirection is used: an extra "> unrelated_samplesID.txt" on this
# command would truncate that file while the matches still go to the last
# redirection target.
grep -w -F -f  UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620.1000unrelatedindiv.sample /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/pleiotropy_geneticfiles/unrelated_n307259/UKB_unrelatedcauc_phenotypes_asthmat2dbmiwaisthip_agesex_waisthipratio_040620 >> phenotype_1000ind.txt
```