## Missingness patterns UKBB

In [None]:
[global]
# Workflow-wide parameters. The comment directly above each `parameter:` line
# describes that parameter.
# The output directory for generated files
parameter: cwd = path
# BED Plink files for exome data
parameter: bedfiles = paths
# BIM Plink files for exome data; paired with `bedfiles` in the `miss` step
# (presumably one BIM per BED, listed in the same order -- verify against caller)
parameter: bimfiles = paths
# The fam file associated to the bed files
parameter: famFile = path 
# For cluster jobs, number of commands to run per job
# NOTE(review): not referenced by any step visible in this file
parameter: job_size = 1
# Number of threads; used as the task `cores` value and for plink2 --threads
parameter: numThreads = 1
# Script template that loads the PLINK2 module from the cluster and then runs
# the step's commands in place of {cmd}; used by `miss` when no plink2
# executable is found
parameter: plink2_module = '''
module load PLINK/2_x86_64_20180428
echo "Module PLINK2 loaded"
{cmd}
'''
# Script template that loads the PLINK 1.9 module from the cluster and then
# runs the step's commands in place of {cmd}
# NOTE(review): not referenced by any step visible in this file
parameter: plink_module = '''
module load PLINK/1.90-beta5.3
echo "Module plink loaded"
{cmd}
'''
# Script template that loads the Eigensoft module from the cluster and then
# runs the step's commands in place of {cmd}
# NOTE(review): not referenced by any step visible in this file
parameter: eigensoft_module = '''
module load EIGENSOFT/7.2.1-foss-2018b
echo "Module Eigensoft v.7.2.1 loaded"
{cmd}
'''
# Software container used to run plink2 in the `miss` step
parameter: container_lmm = 'statisticalgenetics/lmm:1.4'
# Software container used to run marp-cli in the `qc_3` step
parameter: container_marp = 'gaow/marp'

In [None]:
# Compute missingness (plink2 --missing) and allele frequencies (plink2 --freq)
# for each input BED file separately. Every BED file is processed together with
# its paired BIM file and the shared fam file; the declared outputs are the
# .smiss, .vmiss, .afreq and .log files written under `cwd`. The script runs
# as-is when a plink2 executable is found, otherwise it is wrapped in the
# `plink2_module` template.
[miss]
input: bedfiles, paired_with=['bimfiles'], group_by=1
output: f'{cwd}/{_input:bn}.smiss', f'{cwd}/{_input:bn}.vmiss', f'{cwd}/{_input:bn}.afreq', f'{cwd}/{_input:bn}.log'
task: trunk_workers = 1, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: container=container_lmm, expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', template = '{cmd}' if executable('plink2').target_exists() else plink2_module
    plink2 \
      --bed ${_input}  --bim ${_input._bimfiles} --fam ${famFile} \
      --missing \
      --freq \
      --threads ${numThreads} \
      --out ${_output[0]:n} 

In [None]:
# Understand patterns of missing data: for each group of `miss` outputs
# (.smiss, .vmiss, .afreq, .log) write a markdown summary and three histograms
# (sample missingness, variant missingness, allele frequency)
[qc_1]
# Variant missingness threshold: variants with a missing call rate higher than this value are counted
parameter: geno = 0.01
# Sample missingness threshold: samples with a missing rate higher than this value are counted
parameter: mind = 0.02
# Allele frequency threshold: variants with a frequency higher than this value are counted
parameter: maf = 0.01
sep = '\n\n---\n'
input: output_from('miss'),group_by=4
output: analysis_summary = f'{cwd}/{_input[0]:bn}.missing.md',
        sample_plot = f'{cwd}/{_input[0]:bn}.smiss.png',
        variant_plot = f'{cwd}/{_input[0]:bn}.vmiss.png',
        maf_plot = f'{cwd}/{_input[0]:bn}.maf.png'
task: trunk_workers = 1, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: expand = "${ }"
    # Start the report with the slide-deck front matter (overwrites any existing file)
    echo '''---
    theme: base-theme
    style: |
      img {
        height: 80%;
        display: block;
        margin-left: auto;
        margin-right: auto;
      }
    ---    
    ''' > ${_output[0]}
    
R:expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    # load libraries
    library(ggplot2)
    library(scales)
    # NOTE(review): the tables are read with header=FALSE and named by hand; this
    # presumably relies on the plink2 header line starting with '#', which
    # read.table skips as a comment by default -- confirm for the plink2 build
    # in use (the column layout below also assumes an FID column is present).
    # read data sample missingness
    smiss <- read.table(${_input[0]:r}, sep='\t', header=FALSE)
    colnames(smiss) <- c("FID","IID","MISSING_CT", "OBS_CT", "F_MISS")
    out <- capture.output(as.array(summary(smiss$F_MISS)))
    smiss_filter <- length(which(smiss$F_MISS>${mind}))
    # read data variant missingness
    vmiss <- read.table(${_input[1]:r}, sep='\t', header=FALSE)
    colnames(vmiss) <- c("CHROM","ID","MISSING_CT", "OBS_CT", "F_MISS")
    out_1 <- capture.output(as.array(summary(vmiss$F_MISS)))
    vmiss_filter <- length(which(vmiss$F_MISS>${geno}))
    # look at the allele frequency
    afreq <- read.table(${_input[2]:r}, sep='\t', header=FALSE)
    colnames(afreq) <- c("CHROM","ID","REF", "ALT", "ALT_FREQS", "OBS_CT")
    out_2 <- capture.output(as.array(summary(afreq$ALT_FREQS)))
    # NOTE(review): this thresholds the ALT allele frequency, which is reported
    # below as "maf"; the two differ whenever ALT is the major allele -- confirm
    # whether pmin(ALT_FREQS, 1 - ALT_FREQS) is what is wanted here.
    afreq_filter <- length(which(afreq$ALT_FREQS>${maf}))
    # create plots
    png('${_output[1]}', width = 4, height = 4, units='in', res=300)
    smiss_plot <- ggplot(smiss, aes(x=F_MISS)) + geom_histogram(binwidth=0.0001)  + labs(title="Plot of sample missingness rate ${_input[0]:bn}", x ="Proportion of missing data", y = "Number of samples") + theme_classic()
    print(smiss_plot)
    dev.off()
    
    png('${_output[2]}', width = 4, height = 4, units='in', res=300)
    vmiss_plot <- ggplot(vmiss, aes(x=F_MISS)) + geom_histogram(bins=100) + labs(title="Plot of variant missingness rate for ${_input[0]:bn}", x ="Proportion of missing data", y = "Number of SNPs") + theme_classic()
    # print the variant plot (not the sample plot) into the variant PNG
    print(vmiss_plot)
    dev.off()
  
    png('${_output[3]}', width = 4, height = 4, units='in', res=300)
    freq_plot <- ggplot(afreq, aes(ALT_FREQS)) + geom_histogram(bins=200) + xlim(c(0,0.5))  + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),labels = trans_format("log10", math_format(10^.x))) + labs(title="Minor allele frequency distribution for ${_input[0]:bn}", x ="MAF", y = "Number of SNPs") + theme_classic()
    print(freq_plot)
    dev.off()
  
    # append the text summary to the report inside a fenced code block
    write('# ${_input[0]:b} result summary\n## Sample missingness summary:\n```', ${_output[0]:r}, append = TRUE)
    cat('The number of samples with a proportion of missing SNPs higher than ${mind} is:',smiss_filter,".\n", file=${_output[0]:r}, append = TRUE )
    write.table(out, ${_output[0]:r}, append = TRUE)
    cat('The number of variants missing in more than ${geno} of the samples is:',vmiss_filter,".\n", file=${_output[0]:r}, append = TRUE )
    write.table(out_1, ${_output[0]:r}, append = TRUE)
    cat('The number of variants with maf>${maf} is:',afreq_filter,".\n", file=${_output[0]:r}, append = TRUE )
    write.table(out_2, ${_output[0]:r}, append = TRUE)
    write("```", ${_output[0]:r}, append = TRUE)

In [35]:
# Extend each markdown report from qc_1 in place with the variant counts from
# the plink2 log, the three plots and a listing of result files, then append
# the report to the merged markdown file
[qc_2]
# Slide separator; contains real newlines after expansion, so it must only be
# interpolated inside a quoted shell string
sep = '\n\n---\n'
input: output_from('qc_1'),group_by=4
bash: expand=True
  set -e
  echo -e "# Number of analyzed variants for {_input[0]:bn}\n" >> {_input[0]}
  # Copy the log lines mentioning "variants" into the report, then close the slide
  grep -w "variants" {_input[0]:nn}.log >> {_input[0]}
  echo -e "{sep}" >> {_input[0]}
  echo -e "# Sample missingness rate plot for {_input[1]:bn}\n" >> {_input[0]}
  echo -e "![]({_input[1]:b}){sep}" >> {_input[0]}
  echo -e "# Variant missingess rate plot for {_input[2]:bn}\n" >> {_input[0]}
  echo -e "![]({_input[2]:b}){sep}" >> {_input[0]}
  echo -e "# Minor allele frequency plot for {_input[3]:bn}\n" >> {_input[0]}
  echo -e "![]({_input[3]:b}){sep}" >> {_input[0]}
  echo -e "# Result files\n\`\`\`" >> {_input[0]}
  ls {_input[0]:nn}.* | grep -vP 'stderr|stdout'>> {_input[0]}
  echo -e "\`\`\`" >> {_input[0]}
bash:expand=True
  # The merged file name is the report path with its last three
  # underscore-separated fields removed.
  # NOTE(review): this appends on every run and from every input group, so
  # reruns duplicate content -- confirm that is acceptable.
  touch {str(_input[0]).rsplit('_',3)[0]}.merged.md && for i in $(ls -rt {_input[0]:nn}.* | grep "missing.md");do cat $i >> {str(_input[0]).rsplit('_',3)[0]}.merged.md;done

In [1]:
# Generate analysis report from the merged markdown file with marp-cli: an HTML
# file (the declared output) and a PPTX file written next to it. Both are
# always rendered; the PPTX is not declared in `output:`.
[qc_3]
# Prefix of the merged markdown file to render; the step reads <cwd>/<ukbb_field>.merged.md
parameter: ukbb_field = 'ukb23155'
input: f'{cwd}/{ukbb_field}.merged.md'
output: f'{cwd}/{_input:bn}.html'
sh: container=container_marp, expand = True, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    node /opt/marp/.cli/marp-cli.js {_input} -o {_output:a} \
        --title 'Sample and variant missingness UKBB {_input:bn}' \
        --allow-local-files
    node /opt/marp/.cli/marp-cli.js {_input} -o {_output:an}.pptx \
        --title 'Sample and variant missingness UKBB {_input:bn}' \
        --allow-local-files 