# Filter specific individuals and SNPs from the exome data

## How to run this notebook

```
sos run ~/project/UKBB_GWAS_dev/workflow/plink_extract.ipynb extract_region\
    --cwd ~/test \
    --bedfiles ~/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c5.merged.filtered.bed\
    --region 5_272741_1213528 \
    --chrom 5\
    --start_pos 272741\
    --end_pos 1213528\
    --job_size 1 \
    --numThreads 2
```

In [None]:
[global]
# the output directory for generated files
parameter: cwd = path
# BED Plink files for exome data
parameter: bedfiles = path
# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Number of threads
parameter: numThreads = 1
# Load Plink module from cluster
parameter: plink2_module = '''
module load PLINK/2_x86_64_20180428
echo "Module PLINK2 loaded"
{cmd}
'''
parameter: plink_module = '''
module load PLINK/1.90-beta5.3
echo "Module plink loaded"
{cmd}
'''

# Software container option
parameter: container_lmm = 'statisticalgenetics/lmm:2.9'
parameter: container_marp = 'gaow/marp'

In [None]:
#select individuals and filter specific snps
[extract_snps]
# Snps to extract
parameter: snp_list = path
parameter: sample_list = path
input: bedfiles
output: f'{cwd}/{_input:bn}.extract.raw'
task: trunk_workers = 1, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = '{cmd}' if executable('plink').target_exists() else plink_module
    plink \
      --bfile ${_input:n}  \
      --keep ${sample_list} \
      --extract ${snp_list} --range \
      --recodeA \
      --threads ${numThreads} \
      --out ${_output:n} 

In [None]:
[extract_region]
parameter: region = str
parameter: chrom = int
parameter: start_pos = int
parameter: end_pos = int
input: bedfiles
output: f'{cwd}/{_input:bn}.{region}.bed'
task: trunk_workers = 1, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: container=container_lmm, expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = '{cmd}' if executable('plink').target_exists() else plink_module
    plink \
      --bfile ${_input:n}  \
      --chr ${chrom} \
      --from-bp ${start_pos}\
      --to-bp ${end_pos}\
      --make-bed \
      --threads ${numThreads} \
      --out ${_output:n} 

In [4]:
module load Singularity
sos run ~/project/UKBB_GWAS_dev/workflow/plink_extract.ipynb extract_region\
    --cwd ~/test \
    --bedfiles ~/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c5.merged.filtered.bed\
    --region 5_272741_1213528 \
    --chrom 5\
    --start_pos 272741\
    --end_pos 1213528\
    --job_size 1 \
    --numThreads 2\
    --container_lmm ~/containers/lmm.sif

INFO: Running [32mextract_region[0m: 
INFO: [32mextract_region[0m is [32mcompleted[0m.
INFO: [32mextract_region[0m output:   [32m/home/dmc2245/test/ukb23156_c5.merged.filtered.5_272741_1213528.bed[0m
INFO: Workflow extract_region (ID=w7e6c22bb0452895c) is executed successfully with 1 completed step.



In [1]:
# Now check if those individuals have mental retardation
setwd("/home/dc2325/scratch60/plink_extract")
pheno <- read.csv("individuals_test.csv", sep=",", header=TRUE)
nrow(pheno)

In [52]:
head(pheno[,1:20])

Unnamed: 0_level_0,eid,X4689.0.0.x,X4689.1.0.x,X4689.2.0.x,X4689.3.0.x,X5194.0.0.x,X5194.1.0.x,X5196.0.0.x,X5196.1.0.x,X5253.0.0.x,X5254.0.0.x,X5254.1.0.x,X5255.0.0.x,X5255.1.0.x,X5256.0.0.x,X5256.1.0.x,X5257.0.0.x,X5257.1.0.x,X5258.0.0.x,X5258.1.0.x
Unnamed: 0_level_1,<int>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
1,2237835,,,,,,,,,,,,,,,,,,,
2,3927542,,,,,,,,,,,,,,,,,,,


In [12]:
library(dplyr)
icd10 <- pheno %>%
    select("eid", starts_with("X41270"))
dim(icd10)

In [46]:
head(icd10[,1:13])

Unnamed: 0_level_0,eid,X41270.0.0.x,X41270.0.1.x,X41270.0.2.x,X41270.0.3.x,X41270.0.4.x,X41270.0.5.x,X41270.0.6.x,X41270.0.7.x,X41270.0.8.x,X41270.0.9.x,X41270.0.10.x,X41270.0.11.x
Unnamed: 0_level_1,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<lgl>,<lgl>,<lgl>
1,2237835,H118,,,,,,,,,,,
2,3927542,D649,E059,L030,M8195,S7210,T920,W010,Y831,Z896,,,


In [49]:
icd10$present <- !!rowSums(sapply(icd10[1:2,], grepl, pattern = "H11"))

In [50]:
head(icd10$present)

In [42]:
head(icd10[,213:215])

Unnamed: 0_level_0,X41270.0.211.x,X41270.0.212.x,present
Unnamed: 0_level_1,<lgl>,<lgl>,<lgl>
1,,,False
2,,,False


## Check variants for Finnish study Isabelle

In [9]:
#bedfiles=`echo /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c{1..22}_b0_v1.bed`
cwd=/home/dc2325/scratch60/isabelle_finnish
snp_list=/home/dc2325/scratch60/isabelle_finnish/snps_extract.txt

sos run ~/project/UKBB_GWAS_dev/workflow/plink_extract.ipynb  \
    --cwd $cwd \
    --bedfiles /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c3_b0_v1.bed\
    --snp_list $snp_list

INFO: Running [32mdefault[0m: select individuals and filter specific snps
INFO: [32mdefault[0m is [32mcompleted[0m.
INFO: [32mdefault[0m output:   [32m/home/dc2325/scratch60/isabelle_finnish/ukb23155_c3_b0_v1.extract.raw[0m
INFO: Workflow default (ID=w41d699249e2b581d) is executed successfully with 1 completed step.


In [16]:
cat /home/dc2325/scratch60/isabelle_finnish/ukb23155_c3_b0_v1.extract.raw | head

FID IID PAT MAT SEX PHENOTYPE 3:49006137:C:T_T
1434748 1434748 0 0 2 -9 0
5523981 5523981 0 0 1 -9 0
5023838 5023838 0 0 2 -9 0
4023729 4023729 0 0 1 -9 0
4442146 4442146 0 0 2 -9 0
5654789 5654789 0 0 2 -9 0
4515669 4515669 0 0 2 -9 0
1129683 1129683 0 0 1 -9 0
5327043 5327043 0 0 2 -9 0
cat: write error: Broken pipe


In [19]:
awk '$7==2 {print}' /home/dc2325/scratch60/isabelle_finnish/ukb23155_c3_b0_v1.extract.raw

In [11]:
#bedfiles=`echo /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c{1..22}_b0_v1.bed`
cwd=/home/dc2325/scratch60/isabelle_finnish
snp_list=/home/dc2325/scratch60/isabelle_finnish/snps_extract.txt

sos run ~/project/UKBB_GWAS_dev/workflow/plink_extract.ipynb  \
    --cwd $cwd \
    --bedfiles /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c7_b0_v1.bed\
    --snp_list $snp_list

INFO: Running [32mdefault[0m: select individuals and filter specific snps
INFO: [32mdefault[0m is [32mcompleted[0m.
INFO: [32mdefault[0m output:   [32m/home/dc2325/scratch60/isabelle_finnish/ukb23155_c7_b0_v1.extract.raw[0m
INFO: Workflow default (ID=wf05336cfa9ceef9c) is executed successfully with 1 completed step.


In [17]:
cat /home/dc2325/scratch60/isabelle_finnish/ukb23155_c7_b0_v1.extract.raw | head

FID IID PAT MAT SEX PHENOTYPE 7:106099254:C:T_T
1434748 1434748 0 0 2 -9 0
5523981 5523981 0 0 1 -9 0
5023838 5023838 0 0 2 -9 0
4023729 4023729 0 0 1 -9 0
4442146 4442146 0 0 2 -9 0
5654789 5654789 0 0 2 -9 0
4515669 4515669 0 0 2 -9 0
1129683 1129683 0 0 1 -9 0
5327043 5327043 0 0 2 -9 0
cat: write error: Broken pipe


In [21]:
awk '$7==2 {print}' /home/dc2325/scratch60/isabelle_finnish/ukb23155_c7_b0_v1.extract.raw

In [13]:
#bedfiles=`echo /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c{1..22}_b0_v1.bed`
cwd=/home/dc2325/scratch60/isabelle_finnish
snp_list=/home/dc2325/scratch60/isabelle_finnish/snps_extract.txt

sos run ~/project/UKBB_GWAS_dev/workflow/plink_extract.ipynb  \
    --cwd $cwd \
    --bedfiles /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c20_b0_v1.bed\
    --snp_list $snp_list

INFO: Running [32mdefault[0m: select individuals and filter specific snps
[91mERROR[0m: [91mdefault (id=b883a05498325d46) returns an error.[0m
[91mERROR[0m: [91m[default]: [0]: 
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
script_4000695183850138131 in <module>
        --out {_output:n}
      
----> """, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = '{cmd}' if executable('plink').target_exists() else plink_module)

FileNotFoundError: 2[0m


: 1

In [14]:
#bedfiles=`echo /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c{1..22}_b0_v1.bed`
cwd=/home/dc2325/scratch60/isabelle_finnish
snp_list=/home/dc2325/scratch60/isabelle_finnish/snps_extract.txt

sos run ~/project/UKBB_GWAS_dev/workflow/plink_extract.ipynb  \
    --cwd $cwd \
    --bedfiles /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c14_b0_v1.bed\
    --snp_list $snp_list

INFO: Running [32mdefault[0m: select individuals and filter specific snps
INFO: [32mdefault[0m is [32mcompleted[0m.
INFO: [32mdefault[0m output:   [32m/home/dc2325/scratch60/isabelle_finnish/ukb23155_c14_b0_v1.extract.raw[0m
INFO: Workflow default (ID=w3a79d953ba363d47) is executed successfully with 1 completed step.


In [18]:
cat /home/dc2325/scratch60/isabelle_finnish/ukb23155_c14_b0_v1.extract.raw | head

FID IID PAT MAT SEX PHENOTYPE 14:88578149:T:C_C
1434748 1434748 0 0 2 -9 0
5523981 5523981 0 0 1 -9 0
5023838 5023838 0 0 2 -9 0
4023729 4023729 0 0 1 -9 0
4442146 4442146 0 0 2 -9 0
5654789 5654789 0 0 2 -9 0
4515669 4515669 0 0 2 -9 0
1129683 1129683 0 0 1 -9 0
5327043 5327043 0 0 2 -9 0
cat: write error: Broken pipe


In [22]:
awk '$7==2 {print}' /home/dc2325/scratch60/isabelle_finnish/ukb23155_c14_b0_v1.extract.raw

# Check variants HI

In [5]:
#bedfiles=`echo /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c{1..22}_b0_v1.bed`
cwd=/home/dc2325/scratch60/output
snp_list=/home/dc2325/scratch60/output/snps_extract.txt

sos run ~/project/UKBB_GWAS_dev/workflow/plink_extract.ipynb  \
    --cwd $cwd \
    --bedfiles /gpfs/gibbs/pi/dewan/data/UKBiobank/genotype_files/ukb28374_exomedata/exome_data_OCT2020/ukb23155_c6_b0_v1.bed\
    --snp_list $snp_list

INFO: Running [32mdefault[0m: select individuals and filter specific snps
INFO: [32mdefault[0m is [32mcompleted[0m.
INFO: [32mdefault[0m output:   [32m/home/dc2325/scratch60/output/ukb23155_c6_b0_v1.extract.raw[0m
INFO: Workflow default (ID=w4b130b0fe2c619d6) is executed successfully with 1 completed step.



In [3]:
touch /home/dc2325/scratch60/output/snps_extract.txt
echo '6 75841299 75841299 rs121912560' > /home/dc2325/scratch60/output/snps_extract.txt


