# Extract regions of a chromosome using qctool

In [None]:
[global]
# Shared parameters and the list of regions used by every step of the workflow.
# Work directory where output will be saved to
parameter: cwd = path
# Region specifications: a whitespace-delimited text file, one region per line (chr start end)
parameter: region_file = path
# Directory containing the bgen/exome files (it is listed to locate the per-chromosome file)
parameter: genoFile = path
# Number of tasks to run in each job on cluster
parameter: job_size = 1
# Number of threads
parameter: numThreads = 8
# Specify the container to use
#parameter: container_lmm = 'statisticalgenetics/lmm:3.0'
# Shell template wrapped around every bash action; {cmd} is replaced by the action's script
parameter: qctool_module = '''
module load Qctool/2.0.8
echo "Module qctool loaded"
{cmd}
'''
fail_if(not region_file.is_file(), msg = 'Cannot find regions to extract. Please specify them using ``--region-file`` option.')
# Load all regions of interest. Each item in the list will be a region: (chr, start, end)
# read_text() closes the file once it has been read. dict.fromkeys() removes
# duplicated regions while keeping the order of the file, so the list is the
# same in every process (a set of strings has no stable order across runs).
regions = list(dict.fromkeys(tuple(line.split()) for line in region_file.read_text().splitlines() if line.strip()))
# Later steps index regions[0], regions[1] and regions[2]; report malformed lines here instead
fail_if(any(len(region) < 3 for region in regions), msg = 'Each line of the region file must provide at least three fields: chr start end.')


In [None]:
[qctool_1]
# Extract each region from the matching per-chromosome BGEN file with qctool
# and write it to its own BGEN file under ``cwd``.
# Output the bgen file with 8bit formatting
parameter: bgen_bits=8
# Directory that holds the ``ukb_imp_chr<chr>...`` BGEN files
parameter: genoFile = path
# One substep per region; inside a substep ``regions`` is a single (chr, start, end) tuple
input: for_each=dict(regions=regions)
import os
import re
# The chromosome label must not be followed by another letter or digit, so
# that "1" does not also select chr10-chr19 (or "X" select chrXY). Only
# ``.bgen`` files are accepted, so index/sample companions are never passed to
# qctool.
chrom_pattern = re.compile("ukb_imp_chr" + re.escape(regions[0]) + r"(?![0-9A-Za-z])")
# Sorted so that the chosen file does not depend on directory listing order
candidates = sorted(entry for entry in os.listdir(f"{genoFile}") if entry.endswith(".bgen") and chrom_pattern.search(entry))
# Stop with a clear message instead of a NameError on ``name`` below
fail_if(not candidates, msg = f'Cannot find a ukb_imp_chr{regions[0]} BGEN file in {genoFile}')
name = candidates[-1]
geno = os.path.join(f"{genoFile}", name)
output: f'{cwd}/{name}_{regions[0]}_{regions[1]}_{regions[2]}.bgen'
task: trunk_workers = 1, trunk_size = job_size, cores = numThreads, walltime = '48h', mem = '60G', tags = f'{step_name}_{_output:bn}'
bash: expand= "${ }", workdir = cwd, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = qctool_module
    qctool \
    -g ${geno} \
    -og ${_output} \
    -incl-range ${regions[0]}:${regions[1]}-${regions[2]} \
    -bgen-bits ${bgen_bits} \
    -threads ${numThreads}

In [None]:
[qctool_2]
# Build a bgenix index for the BGEN file extracted by the previous step.
# ``bgenix -index`` writes the index next to its input as "<input>.bgi", so
# the declared output keeps the full file name (``:b`` keeps the ".bgen"
# extension, ``:bn`` would drop it and point at a file that is never created).
# This also keeps the .stderr/.stdout names distinct from those of qctool_1.
output: f'{cwd}/{_input:b}.bgi'
bash: expand= "${ }", workdir = cwd, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = qctool_module
    bgenix \
    -g ${_input} \
    -index

In [None]:
# Generate a cluster job script that runs the qctool region-extraction workflow.
cwd=~/test/region_extract
# The workflow reads its regions from a file (one "chr start end" per line)
# given with --region_file; it declares no --region or --namereg parameter.
region_file=~/test/region_extract/test_chr5
sos_qctool=~/project/UKBB_GWAS_dev/workflow/qctool_extract_region.ipynb
# --genoFile must be the directory holding the ukb_imp_chr*.bgen files: the
# workflow lists this directory to find the file of the requested chromosome.
genoFile=/mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/ukb39554_imputeddataset/
extract_sbatch=~/hearing/region_extraction/

extract_args="""  qctool \
    --cwd $cwd
    --region_file $region_file
    --genoFile $genoFile
"""
# NOTE(review): tpl_file is not defined in this cell; it has to be set
# beforehand. The ${...:?} form stops with a message instead of silently
# passing an empty --template-file.
sos run ~/project/UKBB_GWAS_dev/admin/Get_Job_Script.ipynb csg \
    --template-file "${tpl_file:?set tpl_file to the job template file first}" \
    --workflow-file $sos_qctool \
    --to-script $extract_sbatch \
    --args "$extract_args"

In [None]:
# Dry-run the whole qctool workflow (region extraction, then indexing)
# using the test_chr5 region file.
sos dryrun \
    ~/project/UKBB_GWAS_dev/workflow/qctool_extract_region.ipynb \
    qctool \
    --cwd ~/test/ \
    --region_file ~/test/region_extract/test_chr5 \
    --genoFile /mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/ukb39554_imputeddataset/

In [None]:
 # Dry-run with the target "qctool:1", which presumably restricts the run to
 # step 1 (region extraction) - confirm against the SoS command line docs.
 sos dryrun \
     ~/project/UKBB_GWAS_dev/workflow/qctool_extract_region.ipynb \
     qctool:1 \
     --cwd ~/test/ \
     --region_file ~/test/region_extract/test_chr5 \
     --genoFile /mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/ukb39554_imputeddataset/