In [1]:
[global]
# Working directory under which output files are written: change accordingly
parameter: cwd = path
# Genotype file in plink binary format (its base name is used to name the merged reference genotype file)
parameter: bfile = path
# Paths to the bgen files (one or more)
parameter: bgenFile = paths
# Path to the sample file that accompanies the bgen files
parameter: sampleFile = path
# Paths to the summary statistics files (one or more)
parameter: sumstatsFiles = paths
# Path to samples of unrelated individuals
parameter: unrelated_samples = path
# Number of samples to use to compute LD
parameter: ld_sample_size = 1000
# Clumping parameters
# Value passed to plink --clump-field
parameter: clump_field = str
# Value passed to plink --clump-annotate
parameter: clump_annotate = str
# Value passed to plink --clump-p1
parameter: clump_p1 = 5e-08
# Value passed to plink --clump-p2
parameter: clump_p2 = 1
# Value passed to plink --clump-r2
# r2 = 0.04 => r = 0.2
parameter: clump_r2 = 0.04
# Value passed to plink --clump-kb
parameter: clump_kb = 2000
# For cluster jobs, number of commands to run per job
parameter: job_size = 1
# Disabled option for the bit depth of the output bgen file.
# NOTE(review): the original note mentions 8bit formatting while the value below is 16 -- confirm before re-enabling.
#parameter: bgen_bits=16
# Specific number of threads to use
parameter: numThreads = int
# Shell templates that load the required module for each step.
# A step uses its template only when the corresponding executable is not found;
# the step's own command is substituted for {cmd}.
parameter: qctool_module = '''
module load QCTOOL/2.0-foss-2016b-rc7-CentOS6.8
echo "Module qctool loaded"
{cmd}
'''
parameter: plink2_module = '''
module load PLINK/2_x86_64_20180428
echo "Module plink2 loaded"
{cmd}
'''

parameter: plink_module = '''
module load PLINK/1.90-beta5.3
echo "Module plink loaded"
{cmd}
'''

In [2]:
sos run LD_clump_patch.ipynb -h

usage: sos run LD_clump_patch.ipynb [workflow_name | -t targets] [options] [workflow_options]
  workflow_name:        Single or combined workflows defined in this script
  targets:              One or more targets to generate
  options:              Single-hyphen sos parameters (see "sos run -h" for details)
  workflow_options:     Double-hyphen workflow-specific parameters

Workflows:
  default

Global Workflow Options:
  --cwd VAL (as path, required)
                        Working directory: change accordingly
  --bfile VAL (as path, required)
                        Genotype file in plink binary format
  --bgenFile  paths

                        Path to bgen files
  --sampleFile VAL (as path, required)
                        Path to sample file
  --sumstatsFiles  paths

                        Path to summary stats file
  --unrelated-samples VAL (as path, required)
                        Path to samples of unrelated individuals
  --ld-sample-size 1000 (as int)
                  

In [None]:
# Select a subset of samples from the BGEN files
# Each bgen file is processed on its own (group_by=1): qctool keeps only the samples
# listed in the `depends` file and writes a new .bgen/.sample pair under cwd.
# The command runs as-is when the `qctool` executable is available; otherwise it is
# wrapped in the qctool_module template, which loads the module first.
# NOTE(review): no step visible here creates the `depends` sample list; presumably it must exist beforehand -- confirm.
[default_1]
depends: f'{cwd}/{unrelated_samples:bn}.{ld_sample_size}.txt'
input: bgenFile, group_by=1
output: f'{cwd}/{_input:bn}.{ld_sample_size}.bgen', f'{cwd}/{_input:bn}.{ld_sample_size}.sample'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h',  mem = '12G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', template = '{cmd}' if executable('qctool').target_exists() else qctool_module
    qctool \
    -g ${_input} \
    -s ${sampleFile} \
    -og ${_output[0]} \
    -os ${_output[1]} \
    -incl-samples ${_depends}

In [None]:
# Make the binary files for the selected samples
# Converts the .bgen/.sample pair from the previous step into plink binary files with
# plink2 --make-bed, written next to the input using the same file prefix.
# Only the .bed file is declared as output; the next step reads the matching .bim
# from the same prefix.
# `ref-first` is a --bgen modifier; per the PLINK 2 documentation it treats the first
# allele as the reference allele.
# The command runs as-is when the `plink2` executable is available; otherwise it is
# wrapped in the plink2_module template.
[default_2]
output: f'{_input[0]:n}.bed'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '12G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = '{cmd}' if executable('plink2').target_exists() else plink2_module
    plink2 \
    --bgen ${_input[0]} ref-first \
    --sample ${_input[1]} \
    --make-bed \
    --out ${_output:n} \
    --threads ${numThreads}

In [None]:
# Remove multiallelic variants and rename indels
# 1. Variant IDs that occur more than once in the input .bim are written to an
#    exclude list and dropped with plink2 --exclude; the result is <prefix>.filtered.*
# 2. Variant IDs of 30 characters or more in the filtered .bim are replaced in place by
#    "<text before the first underscore>_<xxh32 hex digest of the full ID>".
# The bash action runs plink2, so the template guard checks for the `plink2`
# executable and falls back to plink2_module.
[default_3]
depends: Py_Module('xxhash')
output: f'{cwd}/{_input[0]:bn}.filtered.bed'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '12G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = '{cmd}' if executable('plink2').target_exists() else plink2_module
    cut  -d$'\t' -f2 ${_input[0]:n}.bim | sort | uniq -d > ${_output:nn}.exclude
    plink2 \
    --bfile ${_input[0]:n} \
    --exclude ${_output:nn}.exclude \
    --make-bed \
    --out ${_output:n} \
    --threads ${numThreads} \
    --memory 12000
    
python: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    # Fix long SNP names (the threshold applied below is 30 characters).
    # This will result in a false insufficient memory alert and error in the next step, if not dealt with
    import pandas as pd
    from xxhash import xxh32 as xxh
    def shorten_id(x):
        # Keep short IDs untouched; otherwise keep the prefix before the first '_' and append a hash of the full ID
        return x if len(x) < 30 else f"{x.split('_')[0]}_{xxh(x).hexdigest()}"

    # Read the same file that is rewritten below (full path, not just the base name).
    # Every column is read as text, with no NA conversion, so that IDs are always strings
    # and all other fields are written back unchanged.
    dat = pd.read_csv('${_output:n}.bim', header=None, sep='\t', dtype=str, keep_default_na=False)
    dat.columns = ['chrom', 'id', 'gd', 'pos', 'a1', 'a2']
    dat['id'] = dat['id'].apply(shorten_id)
    dat.to_csv('${_output:n}.bim', sep='\t', header=False, index=False)

In [None]:
# Merge all the .bed files into one reference file
# All outputs of the previous step are gathered into a single group (group_by='all').
# The first dataset is passed to --bfile; the remaining ones, with the 4-character
# '.bed' suffix stripped, are written one per line to <output prefix>.merge_list and
# passed to --merge-list. The merged dataset is named after the base name of `bfile`.
# The command runs as-is when the `plink` executable is available; otherwise it is
# wrapped in the plink_module template.
# NOTE(review): with a single input file the merge list contains no dataset -- confirm plink accepts that.
[default_4]
input: group_by = 'all'
output: f'{cwd}/{bfile:bn}.ref_geno.bed'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '64G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = '{cmd}' if executable('plink').target_exists() else plink_module
    echo -e ${' '.join([str(x)[:-4] for x in _input[1:]])} | sed 's/ /\n/g' > ${_output:n}.merge_list
    plink \
    --bfile ${_input[0]:n} \
    --merge-list ${_output:n}.merge_list \
    --make-bed \
    --out ${_output:n} \
    --threads ${numThreads} \
    --memory 64000 

In [None]:
# Perform LD-clumping in PLINKv1.9
# Runs plink --clump on the merged reference genotypes using the comma-joined list of
# summary statistics files and the clump_* parameters, writing <prefix>.clumped.
# The "RANGE" lines of the verbose clump report are then reduced to
# "<chromosome> <start> <end>" rows, version-sorted, and saved as <prefix>.clumped_region.
# The `..` range separator is matched as two literal dots so that no other
# dot-plus-character sequence is rewritten.
# The command runs as-is when the `plink` executable is available; otherwise it is
# wrapped in the plink_module template.
[default_5]
output: f'{_input:nn}.clumped', f'{_input:nn}.clumped_region'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '12G',cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
bash: expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', template = '{cmd}' if executable('plink').target_exists() else plink_module
    plink \
    --bfile ${_input:n} \
    --clump ${sumstatsFiles:,} \
    --clump-field ${clump_field} \
    --clump-p1 ${clump_p1} \
    --clump-p2 ${clump_p2} \
    --clump-r2 ${clump_r2} \
    --clump-kb ${clump_kb} \
    --clump-verbose \
    --clump-annotate ${clump_annotate} \
    --clump-allow-overlap \
    --out ${_output[0]:n} \
    --threads ${numThreads} \
    && touch ${_output[0]} # need to touch and create empty file because some chroms may not have anything significant to clump.
    grep "RANGE" ${_output[0]} | awk -F ":" '{print $2, $3}' | sort -V | sed 's/\.\./ /g; s/^[[:blank:]]*//g' > ${_output[1]}