# Design qmap file to extract variants corresponding to CH genes from 50k UKB WES gVCF 

### Objective

Design a qmap file with one job per gVCF file to extract the variants in CH genes.


### PROCESS

1. Get a list of gVCF file names from the 50k UKB gvcf dataset.


2. Create the job instructions for the 50k qmap jobs: set the files and folders needed, then build the list of commands.


3. Assemble the qmap file from the parameters and the jobs, and save it.


### OUTPUT

Folder: /workspace/datasets/ukbiobank_ch/analysis_50k_202103/

**qmap files: extract_chgenes_ukb.qmap (full run) and extract_chgenes_ukb_test25.qmap (25-job test)**


In [2]:
import pandas as pd
import os

In [8]:
### Get a list of gVCF file names from the 50k UKB gvcf dataset

# List every entry in the 50k gVCF directory
files = os.listdir('/workspace/datasets/ukbiobank_ch/vcf_50k')

# Keep only the gVCF files themselves.
# ! Warning: in the original version an extra file ("gvcf_header.txt") was
# ! added by mistake; this has since been corrected.
# Match on the file-name suffix rather than on a substring, so that companion
# files whose names merely contain ".gvcf.gz" (e.g. a ".gvcf.gz.tbi" index)
# are not picked up and turned into jobs.
files = [file for file in files if file.endswith('.gvcf.gz')]

# Sanity check: preview the first names and the total number of files
print(files[:10])
print(len(files))

['4033478_23161_0_0.gvcf.gz', '4034157_23161_0_0.gvcf.gz', '4033534_23161_0_0.gvcf.gz', '4033567_23161_0_0.gvcf.gz', '4033730_23161_0_0.gvcf.gz', '4033749_23161_0_0.gvcf.gz', '4227854_23161_0_0.gvcf.gz', '4434022_23161_0_0.gvcf.gz', '4332322_23161_0_0.gvcf.gz', '4135509_23161_0_0.gvcf.gz']
49989


In [7]:
### Create the job instructions for the 50k qmap jobs

# Paths to the extraction script, the gene coordinates file and the
# input/output folders
python_file = '/workspace/datasets/ukbiobank_ch/analysis_50k_202103/extract_ch_genes.py'
coordinates = '/workspace/datasets/ukbiobank_ch/analysis_50k_202103/CH_genes_canonical_transcripts_coordinates.bed'
input_folder = '/workspace/datasets/ukbiobank_ch/vcf_50k'
output_folder = '/workspace/datasets/ukbiobank_ch/vcf_ch_50k'

# Build one command line per gVCF file; the pieces are joined with single
# spaces, and the input path is the input folder plus the file name
commands = [
    ' '.join([
        'python', python_file,
        '-i', input_folder + '/' + gvcf_name,
        '-i_coor', coordinates,
        '-o', output_folder,
    ])
    for gvcf_name in files
]

# Check: number of commands, then show them
print(len(commands))
commands


49990


['python /workspace/datasets/ukbiobank_ch/analysis_50k_202103/extract_ch_genes.py -i /workspace/datasets/ukbiobank_ch/vcf_50k/4033478_23161_0_0.gvcf.gz -i_coor /workspace/datasets/ukbiobank_ch/analysis_50k_202103/CH_genes_canonical_transcripts_coordinates.bed -o /workspace/datasets/ukbiobank_ch/vcf_ch_50k',
 'python /workspace/datasets/ukbiobank_ch/analysis_50k_202103/extract_ch_genes.py -i /workspace/datasets/ukbiobank_ch/vcf_50k/4034157_23161_0_0.gvcf.gz -i_coor /workspace/datasets/ukbiobank_ch/analysis_50k_202103/CH_genes_canonical_transcripts_coordinates.bed -o /workspace/datasets/ukbiobank_ch/vcf_ch_50k',
 'python /workspace/datasets/ukbiobank_ch/analysis_50k_202103/extract_ch_genes.py -i /workspace/datasets/ukbiobank_ch/vcf_50k/4033534_23161_0_0.gvcf.gz -i_coor /workspace/datasets/ukbiobank_ch/analysis_50k_202103/CH_genes_canonical_transcripts_coordinates.bed -o /workspace/datasets/ukbiobank_ch/vcf_ch_50k',
 'python /workspace/datasets/ukbiobank_ch/analysis_50k_202103/extract_ch_

In [19]:
### Set qmap file with parameters and jobs

# Header lines of the qmap file: the [pre] section sources conda and activates
# the "clonalh" environment, the [params] section sets cores and memory, and
# "[jobs]" opens the list of commands
qmap_pre_params = [
    '[pre]',
    '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
    'conda activate clonalh',
    '[params]',
    'cores = 1',
    'memory = 20G',
    '[jobs]',
]

# Full file content: header lines first, then one line per job command
qmap_file = list(qmap_pre_params)
qmap_file.extend(commands)
qmap_file

['[pre]',
 '. "/home/$USER/miniconda3/etc/profile.d/conda.sh"',
 'conda activate clonalh',
 '[params]',
 'cores = 1',
 'memory = 20G',
 '[jobs]',
 'python /workspace/datasets/ukbiobank_ch/analysis_50k_202103/extract_ch_genes.py -i /workspace/datasets/ukbiobank_ch/vcf_50k/4033478_23161_0_0.gvcf.gz -i_coor /workspace/datasets/ukbiobank_ch/analysis_50k_202103/CH_genes_canonical_transcripts_coordinates.bed -o /workspace/datasets/ukbiobank_ch/vcf_ch_50k',
 'python /workspace/datasets/ukbiobank_ch/analysis_50k_202103/extract_ch_genes.py -i /workspace/datasets/ukbiobank_ch/vcf_50k/4034157_23161_0_0.gvcf.gz -i_coor /workspace/datasets/ukbiobank_ch/analysis_50k_202103/CH_genes_canonical_transcripts_coordinates.bed -o /workspace/datasets/ukbiobank_ch/vcf_ch_50k',
 'python /workspace/datasets/ukbiobank_ch/analysis_50k_202103/extract_ch_genes.py -i /workspace/datasets/ukbiobank_ch/vcf_50k/4033534_23161_0_0.gvcf.gz -i_coor /workspace/datasets/ukbiobank_ch/analysis_50k_202103/CH_genes_canonical_tran

In [17]:
### Save qmap file

# Write every entry of qmap_file on its own line
with open('/workspace/datasets/ukbiobank_ch/analysis_50k_202103/extract_chgenes_ukb.qmap', 'w') as qmap_out:
    qmap_out.writelines('%s\n' % line for line in qmap_file)

In [20]:
### TESTS

# qmap test file: same header lines, but only the first 25 job commands
qmap_file2 = list(qmap_pre_params)
qmap_file2.extend(commands[:25])
qmap_file2


# Save qmap test file, one entry per line

with open('/workspace/datasets/ukbiobank_ch/analysis_50k_202103/extract_chgenes_ukb_test25.qmap', 'w') as qmap_out:
    qmap_out.writelines('%s\n' % line for line in qmap_file2)

## COMMAND

### $ qmap submit extract_chgenes_ukb.qmap -g 50