# Filter specific individuals and SNPs from the exome data

## How to run this notebook

```
sos run ~/project/UKBB_GWAS_dev/workflow/plink_extract.ipynb extract_region\
    --cwd ~/test \
    --bedfiles ~/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c5.merged.filtered.bed\
    --region 5_272741_1213528 \
    --chrom 5\
    --start_pos 272741\
    --end_pos 1213528\
    --job_size 1 \
    --numThreads 2 \
    --container_lmm ~/containers/lmm.sif
```

In [None]:
[global]
# Output directory for generated files (required)
parameter: cwd = path
# Path to the PLINK .bed file of the exome data (required); used as the input of each step
parameter: bedfiles = path
# For cluster jobs, number of commands to run per job
parameter: job_size = 1
# Number of threads passed to plink (--threads) and requested as cores for each task
parameter: numThreads = 1
# Shell template that loads the PLINK2 cluster module and then runs the command placed at {cmd}
# NOTE(review): not referenced by the steps visible in this section - confirm it is used elsewhere
parameter: plink2_module = '''
module load PLINK/2_x86_64_20180428
echo "Module PLINK2 loaded"
{cmd}
'''
# Shell template that loads the PLINK 1.9 cluster module and then runs the command placed at {cmd};
# the steps use it as their script template when no plink executable is found
parameter: plink_module = '''
module load PLINK/1.90-beta5.3
echo "Module plink loaded"
{cmd}
'''

# Software container options
# Container image passed to the bash action of the extract_region step
parameter: container_lmm = 'statisticalgenetics/lmm:2.9'
# NOTE(review): not referenced by the steps visible in this section - confirm it is used elsewhere
parameter: container_marp = 'gaow/marp'

In [None]:
# Select individuals and extract specific SNP ranges, writing the genotypes to a PLINK .raw file
[extract_snps]
# SNP ranges to extract: a file with one range per line, e.g. 2 30000000 35000000  R1
parameter: snp_list = path
# Samples to keep: a file with the FID and IID columns of the individuals to retain
parameter: sample_list = path
input: bedfiles
# plink appends ".raw" to the --out prefix, so the prefix passed below is this path without its extension
output: f'{cwd}/{_input:bn}.extract.raw'
# trunk_size = job_size makes the documented "commands per job" parameter take effect for cluster submissions
task: trunk_workers = 1, trunk_size = job_size, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
# Run the command as-is when plink is already available; otherwise wrap it in the module-loading template
bash: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = '{cmd}' if executable('plink').target_exists() else plink_module
    plink \
      --bfile ${_input:n} \
      --keep ${sample_list} \
      --extract ${snp_list} --range \
      --recodeA \
      --threads ${numThreads} \
      --out ${_output:n}

In [None]:
# Extract the variants of one chromosome within a base-pair window into a new PLINK binary fileset
[extract_region]
# Label of the region; used only to name the output files, e.g. 5_272741_1213528
parameter: region = str
# Chromosome to extract from
parameter: chrom = int
# First base-pair position of the window (passed to --from-bp)
parameter: start_pos = int
# Last base-pair position of the window (passed to --to-bp)
parameter: end_pos = int
input: bedfiles
# Only the .bed file is tracked as output; --make-bed writes the fileset under the same prefix
output: f'{cwd}/{_input:bn}.{region}.bed'
# trunk_size = job_size makes the documented "commands per job" parameter take effect for cluster submissions
task: trunk_workers = 1, trunk_size = job_size, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
# Run the command as-is when plink is already available; otherwise wrap it in the module-loading template
bash: container=container_lmm, expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = '{cmd}' if executable('plink').target_exists() else plink_module
    plink \
      --bfile ${_input:n} \
      --chr ${chrom} \
      --from-bp ${start_pos} \
      --to-bp ${end_pos} \
      --make-bed \
      --threads ${numThreads} \
      --out ${_output:n}