# Select 10,000 White British individuals from the UK Biobank (UKBB) to create a reference panel

In [None]:
[global]
# Working directory where step outputs are written: change accordingly
parameter: cwd = path
# The fam file from which the individuals are sampled
parameter: famFile = path
# Path(s) to the bgen files; steps process them one file at a time
parameter: bgenFile = paths
# Path to the sample file that accompanies the bgen files
parameter: sampleFile = path
# For cluster jobs, number commands to run per job
parameter: job_size = int
# Specific number of threads to use
parameter: numThreads = int
# Script template used when `qctool` is not found on the PATH: loads the
# qctool module and then runs the step's command in place of {cmd}
parameter: qctool_module = '''
module load Qctool/2.0.8
echo "Module qctool loaded"
{cmd}
'''
# Script template used when `bgenix` is not found on the PATH: loads the
# bgenix module and then runs the step's command in place of {cmd}
parameter: bgenix_module = '''
module load Bgenix/1.1.8
echo "Module bgenix loaded"
{cmd}
'''
# The container with the lmm software. Can be either a dockerhub image or a singularity `sif` file.
parameter: container_lmm = 'statisticalgenetics/lmm:2.9'

In [None]:
# Use the most recent genotype file to select 10,000 individuals at random.
# Reads the fam file without a header, draws 10,000 rows (no seed is set, so
# every run draws a different set) and writes the first column (V1) of the
# sampled rows on a single space-separated line.
[filter_samples_1: provides = [f'{cwd}/{famFile:bn}.10000ind_ref']]
input: famFile
output: f'{cwd}/{_input:bn}.10000ind_ref'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h',  mem = '10G', tags = f'{step_name}_{_output:bn}'
R: container=container_lmm, expand="${ }", stderr=f'{_output:n}.stderr', stdout=f'{_output:n}.stdout'
  library(dplyr)
  # Named `fam` rather than `sample` so that base::sample is not shadowed
  fam = read.table(${famFile:r}, header = FALSE)
  ref = fam %>%
    sample_n(10000) %>%
    select("V1")
  # Transpose so that all IDs end up on one line
  ref = t(ref)
  # quote = FALSE keeps non-numeric IDs free of literal double quotes
  write.table(ref, ${_output:r}, quote = FALSE, col.names = FALSE, row.names = FALSE)

In [None]:
# Filter samples from bgen files.
# Runs qctool once per bgen file, keeping only the samples listed in the
# `.10000ind_ref` file declared in `depends` (the path that filter_samples_1
# provides). The command runs directly when `qctool` is on the PATH and is
# otherwise wrapped in the `qctool_module` template.
# NOTE(review): the genotype output is named `.bim`; confirm this is the
# output format intended for qctool's -og option.
[filter_samples_2]
input: bgenFile, group_by=1
output: f'{cwd}/{_input:bn}.10000ind_ref.bim', f'{cwd}/{_input:bn}.10000ind_ref.sample'
depends: f'{cwd}/{famFile:bn}.10000ind_ref'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h',  mem = '10G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', template = '{cmd}' if executable('qctool').target_exists() else qctool_module
    qctool \
    -g ${_input} \
    -s ${sampleFile} \
    -incl-samples ${_depends} \
    -og ${_output[0]} \
    -os ${_output[1]}

In [None]:
# Extract a genomic region from each bgen file with qctool.
# Runs once per bgen file and writes `<cwd>/<bgen basename>.<region>.bgen`.
# The command runs directly when `qctool` is on the PATH and is otherwise
# wrapped in the `qctool_module` template.
[filter_variants]
# Give the region to extract in the format chr:start-end
parameter: region = str
input: bgenFile, group_by=1
output: f'{cwd}/{_input:bn}.{region}.bgen'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h',  mem = '10G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = '{cmd}' if executable('qctool').target_exists() else qctool_module
    qctool \
    -g ${_input} \
    -s ${sampleFile} \
    -incl-range ${region} \
    -og ${_output}

In [None]:
# Filter a region from a bgen file using bgenix (the file needs to be indexed first).
# The chromosome in `region` must be written exactly as the bgen file encodes it
# (e.g. chr5, 5 or 05); otherwise the command will fail to produce output.
# Runs once per bgen file; bgenix's standard output is redirected to
# `<cwd>/<bgen basename>.<region>.bgen`. The command runs directly when `bgenix`
# is on the PATH and is otherwise wrapped in the `bgenix_module` template.
[filter_region_bgenix]
parameter: region = str
input: bgenFile, group_by=1
output: f'{cwd}/{_input:bn}.{region}.bgen'
task: trunk_workers = 1, trunk_size = job_size, walltime = '48h',  mem = '10G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = '{cmd}' if executable('bgenix').target_exists() else bgenix_module
   bgenix \
    -g ${_input} \
    -incl-range ${region} > ${_output}

In [None]:
# Example: run the `filter_variants` step on the chr5 imputed bgen file.
# NOTE(review): `--famFile` below is followed directly by `--sampleFile`, i.e.
# it has no value. `famFile` is declared in [global] without a default, so
# supply the path to the fam file here before running.
sos run ~/project/UKBB_GWAS_dev/workflow/Reference_sampleUKBB.ipynb filter_variants\
    --cwd ~/test \
    --famFile \
    --sampleFile ~/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb32285_imputedindiv.sample\
    --bgenFile ~/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr5_v3.bgen\
    --region 5:272856-1213643 \
    --job_size 1 \
    --numThreads 2

In [None]:
# Example: call qctool directly (outside the workflow) to extract the range
# 05:272856-1213643 from the chr5 bgen file into ~/ukb_imp_chr5_v3_subset.bgen.
# Here the chromosome is written as 05 and also passed via -assume-chromosome.
qctool -g ~/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr5_v3.bgen\
       -s ~/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb32285_imputedindiv.sample\
       -incl-range 05:272856-1213643 \
       -assume-chromosome 05 \
       -og ~/ukb_imp_chr5_v3_subset.bgen

## Using bgenix

In [None]:
# Example: call bgenix directly to extract the range 05:272856-1213643 from the
# chr5 bgen file, redirecting its standard output to ~/test/chr5_test.bgen.
bgenix -g ~/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr5_v3.bgen -incl-range 05:272856-1213643 > ~/test/chr5_test.bgen