# Sample and variant missingness statistics

Understand the patterns of missing data in the UKBB exome PLINK files.

## Aim

To generate missingness statistics for exome data of the UKBB and create a pptx/html report in order to decide the filtering parameters that will be used in other downstream analyses.

FIXME: the report created with marp needs to be reorganized so that each graph fits within one slide and the tables are easier to read.

## Running the pipeline

In the bash terminal, set the variables and then run:

```
# Run the workflow; every $variable below must be set in the shell beforehand.
# NOTE(review): the [global] section of this notebook declares `genoFile`
# (not bedfiles/bimfiles/famFile) -- confirm these option names are still accepted.
sos run ~/project/UKBB_GWAS_dev/plink_missing.ipynb \
    --cwd $cwd \
    --bedfiles $bedfiles \
    --bimfiles $bimfiles \
    --famFile $famFile \
    --numThreads $numThreads \
    --job_size $job_size \
    --container_lmm $container_lmm \
    --container_marp $container_marp
```

# On Yale's cluster

```
# Root of the UK Biobank data on the cluster
UKBB_PATH=/gpfs/gibbs/pi/dewan/data/UKBiobank
# Job template passed to Get_Job_Script.ipynb via --template-file
tpl_file=../farnam.yml
# Directory passed to the workflow as --cwd (where results are written)
pca_dir=$UKBB_PATH/results/pca_exomes
# Exome PLINK inputs: one .fam file, and per-chromosome .bed/.bim files for chr1-22
famFile=$UKBB_PATH/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_s200631.fam
bedfiles=`echo $UKBB_PATH/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c{1..22}_b0_v1.bed`
bimfiles=`echo $UKBB_PATH/genotype_files/ukb28374_exomedata/exome_data_OCT2020/UKBexomeOQFE_chr{1..22}.bim`
# Container images (local .sif files)
container_lmm=$UKBB_PATH/lmm.sif
container_marp=$UKBB_PATH/marp.sif
# Pipeline notebook to run
plink_sos=~/project/UKBB_GWAS_dev/workflow/plink_missing.ipynb
# Name of the generated sbatch script (prefixed with today's date)
plink_sbatch=../output/$(date +"%Y-%m-%d")_plink_miss.sbatch
numThreads=1
job_size=1

# Arguments forwarded to the `missing` workflow.
# NOTE(review): the [global] section of this notebook declares `genoFile`
# (not bedfiles/bimfiles/famFile) -- confirm these option names are still accepted.
plink_args="""missing \
    --cwd $pca_dir \
    --bedfiles $bedfiles \
    --bimfiles $bimfiles \
    --famFile $famFile \
    --numThreads $numThreads \
    --job_size $job_size \
    --container_lmm $container_lmm \
    --container_marp $container_marp \
    -s build
"""

# Write the job script to $plink_sbatch using the farnam template
sos run ~/project/bioworkflows/GWAS/Get_Job_Script.ipynb farnam \
    --template-file $tpl_file \
    --workflow-file $plink_sos \
    --to-script $plink_sbatch \
    --args "$plink_args"
```

## Command interface

In [None]:
# Show the command-line help for this notebook
sos run plink_missing.ipynb -h

In [None]:
[global]
# The output directory for generated files (no default: must be given on the command line)
parameter: cwd = path
# PLINK BED files for the exome data (no default); missing_1 processes them one file at a time
parameter: genoFile = paths
# For cluster jobs, number of commands to run per job
parameter: job_size = 1
# Number of threads passed to plink and requested as cores for each task
parameter: numThreads = 1
# Software container options: image used for the plink step and image used for the marp report step
parameter: container_lmm = 'statisticalgenetics/lmm:1.8'
parameter: container_marp = 'gaow/marp'

In [None]:
# Genotype and sample missingness for exome files
# For each input BED file, run plink --missing --freq and write the
# .imiss (per-sample), .lmiss (per-variant), .frq and .log files into cwd.
[missing_1]
input: genoFile, group_by=1
output: f'{cwd}/{_input:bn}.imiss', f'{cwd}/{_input:bn}.lmiss', f'{cwd}/{_input:bn}.frq', f'{cwd}/{_input:bn}.log'
# trunk_size = job_size makes the job_size parameter effective (it was declared but never used)
task: trunk_workers = 1, trunk_size = job_size, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: container=container_lmm, expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    plink \
      --bfile ${_input:n} \
      --missing \
      --freq \
      --threads ${numThreads} \
      --out ${_output[0]:n}

In [None]:
# Generate plots to understand patterns of missing data
# For every (.imiss, .lmiss, .frq, .log) group produced by missing_1 this step
# writes a markdown summary plus three PNG histograms (sample missingness,
# variant missingness, minor allele frequency).
[missing_2]
# Filter out variants with missing call rate higher than this value
parameter: geno = 0.01
# Filter out samples with missing rate higher than this value
parameter: mind = 0.02
# Filter based on minor allele frequency
parameter: maf = 0.01
# Slide separator appended after each section of the markdown summary
sep = '\n\n---\n'
input: group_by=4
output: analysis_summary = f'{cwd}/{_input[0]:bn}.missing.md', 
        sample_plot = f'{cwd}/{_input[0]:bn}.imiss.png',  
        variant_plot = f'{cwd}/{_input[0]:bn}.lmiss.png',  
        maf_plot = f'{cwd}/{_input[0]:bn}.maf.png'
# trunk_size = job_size makes the job_size parameter effective (it was declared but never used)
task: trunk_workers = 1, trunk_size = job_size, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'
    # load libraries
    library(ggplot2)
    library(scales)

    report_file <- ${_output[0]:r}

    # Render a ggplot object to a 4 x 4 inch PNG at 300 dpi
    save_png <- function(plot, file) {
      png(file, width = 4, height = 4, units = "in", res = 300)
      print(plot)
      invisible(dev.off())
    }

    # read data sample missingness (expects an F_MISS column)
    smiss <- read.table(${_input[0]:r}, header = TRUE)
    smiss_summary <- capture.output(as.array(summary(smiss$F_MISS)))
    smiss_filter <- sum(smiss$F_MISS > ${mind}, na.rm = TRUE)

    # read data variant missingness (expects an F_MISS column)
    vmiss <- read.table(${_input[1]:r}, header = TRUE)
    vmiss_summary <- capture.output(as.array(summary(vmiss$F_MISS)))
    vmiss_filter <- sum(vmiss$F_MISS > ${geno}, na.rm = TRUE)

    # look at the allele frequency (expects a MAF column)
    afreq <- read.table(${_input[2]:r}, header = TRUE)
    afreq_summary <- capture.output(as.array(summary(afreq$MAF)))
    afreq_filter <- sum(afreq$MAF > ${maf}, na.rm = TRUE)

    # create plots
    # theme_classic() is a complete theme and must come BEFORE theme(),
    # otherwise it resets the plot.title size set by theme().
    smiss_plot <- ggplot(smiss, aes(x = F_MISS)) +
      geom_histogram(aes(y = stat(count) / sum(count)), binwidth = 0.001) +
      labs(title = "Histogram of sample missing call rate\n for ${_input[0]:bn}",
           x = "Proportion of missing variants", y = "Frequency") +
      scale_y_continuous(breaks = seq(0, 1, by = 0.2), labels = scales::percent) +
      theme_classic() +
      theme(plot.title = element_text(size = 10))
    save_png(smiss_plot, '${_output[1]}')

    vmiss_plot <- ggplot(vmiss, aes(x = F_MISS)) +
      geom_histogram(aes(y = stat(count) / sum(count)), bins = 100) +
      labs(title = "Plot of variant missingness rate\n for ${_input[0]:bn}",
           x = "SNP missing call rate", y = "Frequency") +
      scale_y_continuous(breaks = seq(0, 1, by = 0.2), labels = scales::percent) +
      theme_classic() +
      theme(plot.title = element_text(size = 10))
    save_png(vmiss_plot, '${_output[2]}')

    freq_plot <- ggplot(afreq, aes(MAF)) +
      geom_histogram(bins = 100) +
      xlim(c(0, 0.5)) +
      scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
                    labels = trans_format("log10", math_format(10^.x))) +
      labs(title = "Minor allele frequency distribution\n for ${_input[0]:bn}",
           x = "MAF", y = "Number of SNPs") +
      theme_classic() +
      theme(plot.title = element_text(size = 10))
    save_png(freq_plot, '${_output[3]}')

    # write the markdown summary; the first write truncates the file so that a
    # re-run does not append a second copy of the report to an existing one
    write('# ${_input[0]:b} result summary\n## Sample missingness summary:\n```', report_file, append = FALSE)
    cat('##The number of samples with a proportion of missing SNPs higher than ${mind} is:', smiss_filter, ".\n", file = report_file, append = TRUE)
    # the captured summaries are plain text lines: write them as-is rather than
    # through write.table(), which quoted them and added row/column names
    write(smiss_summary, report_file, append = TRUE)
    cat('##The number of variants missing in more than ${geno} of the samples is:', vmiss_filter, ".\n", file = report_file, append = TRUE)
    write(vmiss_summary, report_file, append = TRUE)
    cat('##The number of variants with maf>${maf} is:', afreq_filter, ".\n", file = report_file, append = TRUE)
    write(afreq_summary, report_file, append = TRUE)
    write("```${sep}", report_file, append = TRUE)
  
bash: expand=True
  set -e
  echo -e "### Number of analyzed variants for {_output[0]:bn}\n" >> {_output[0]}
  # send the matching plink log lines into the report (they previously went to stdout only)
  grep -w "variants" {_output[0]:nn}.log >> {_output[0]} && echo -e "{sep}" >> {_output[0]}
  echo -e "#\n#\n### Sample missingness rate plot for {_output[1]:bn}\n" >> {_output[0]}
  echo -e "![]({_output[1]:b}){sep}" >> {_output[0]}
  echo -e "#\n#\n### Variant missingness rate plot for {_output[2]:bn}\n" >> {_output[0]}
  echo -e "![]({_output[2]:b}){sep}" >> {_output[0]}
  echo -e "### Minor allele frequency plot for {_output[3]:bn}\n" >> {_output[0]}
  echo -e "![]({_output[3]:b}){sep}" >> {_output[0]}
  echo -e "### Result files\n\`\`\`" >> {_output[0]}
  # list every file sharing this dataset's prefix, except the captured stderr/stdout logs
  ls {_output[0]:nn}.* | grep -vP 'stderr|stdout'>> {_output[0]}
  echo -e "\`\`\`" >> {_output[0]}

In [35]:
# Generate the merged markdown file
# Concatenates every per-dataset *.missing.md summary received from the previous
# step, in input order, behind a Marp front matter block.
[missing_3]
# Use only the file name of the first input: the full path already contains the
# directory, so prefixing it with cwd would duplicate the path. Double quotes on
# the outside keep the f-string valid on Python versions before 3.12.
output: f"{cwd}/{_input[0].name.rsplit('_', 3)[0]}.merged.md"
bash: expand = "${ }"
    # printf emits the front matter at fixed columns, independent of how this
    # script is indented (the delimiters and the YAML block scalar depend on it)
    printf '%s\n' \
        '---' \
        'theme: base-theme' \
        'style: |' \
        '  img {' \
        '    #display: block;' \
        '    height: 60%;' \
        '    width: 60%;' \
        '    #margin-top: 22%;' \
        '  }' \
        '---' \
        '' > ${_output}

    # the inputs also contain the PNG plots; append only the markdown summaries
    for md in ${_input}; do
        case "$md" in
            *.missing.md) cat "$md" >> ${_output} ;;
        esac
    done

In [1]:
# Generate analysis report: HTML file, and optionally PPTX file
# Both renderings come from the same merged markdown file and share one title;
# only the HTML file is declared as the step output.
[missing_4]
output: f'{_input:n}.html'
sh: container=container_marp, expand = True, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    marp_cli=/opt/marp/.cli/marp-cli.js
    slides={_output:n}.md
    title='Sample and variant missingness UKBB {_output:bn}'
    # HTML report
    node $marp_cli $slides -o {_output:a} --title "$title" --allow-local-files
    # PPTX report, written next to the HTML file
    node $marp_cli $slides -o {_output:an}.pptx --title "$title" --allow-local-files