# Filter specific individuals and SNPs from the exome data

In [None]:
[global]
# Workflow-wide parameters. Those declared as `path` have no default and must
# be supplied on the command line.
# Output directory for generated files; the [default] step writes its .raw
# output (and the matching .stdout/.stderr logs) under this directory.
parameter: cwd = path
# PLINK .bed file for the exome data (first input of the [default] step,
# passed to `--bed`)
parameter: bedfiles = path
# PLINK .bim file for the exome data (second input of the [default] step,
# passed to `--bim`)
parameter: bimfiles = path
# The .fam file associated with the bed file (third input, passed to `--fam`)
parameter: famFile = path 
# File listing the SNPs to extract (passed to `--extract ... --range`)
parameter: snp_list = path
# For cluster jobs, number of commands to run per job
# NOTE(review): not referenced by the [default] step visible here — confirm
# whether another step uses it.
parameter: job_size = 1
# Number of threads; used both as the task's `cores` and as plink `--threads`
parameter: numThreads = 1
# Shell template that loads the PLINK 2 module on the cluster before running
# the step's command, which is substituted for {cmd}.
# NOTE(review): not referenced by the [default] step visible here.
parameter: plink2_module = '''
module load PLINK/2_x86_64_20180428
echo "Module PLINK2 loaded"
{cmd}
'''
# Shell template that loads the PLINK 1.9 module; the [default] step uses it
# as the bash `template` when no `plink` executable is found.
parameter: plink_module = '''
module load PLINK/1.90-beta5.3
echo "Module plink loaded"
{cmd}
'''

# Software container options
# NOTE(review): neither container is referenced by the [default] step visible
# here — confirm they are still needed.
parameter: container_lmm = 'statisticalgenetics/lmm:1.4'
parameter: container_marp = 'gaow/marp'

In [None]:
# Extract the SNPs listed in `snp_list` from the exome bed/bim/fam fileset and
# recode them with `--recodeA`, writing `<cwd>/<bed basename>.extract.raw`.
# plink's stdout/stderr are captured next to the output as
# `<output prefix>.stdout` / `<output prefix>.stderr`.
# The command runs directly when a `plink` executable is found; otherwise it
# is wrapped in `plink_module`, which loads the cluster module first.
# No `--keep`/`--remove` is passed, so every individual in the .fam file is
# retained; only the variant set is filtered.
# NOTE(review): `--range` and `--recodeA` are legacy PLINK 1.07 spellings; the
# PLINK 1.9 documentation lists `--extract range <file>` and `--recode A`.
# Confirm the installed PLINK still accepts the legacy forms.
[default]
input: bedfiles, bimfiles, famFile
output: f'{cwd}/{_input[0]:bn}.extract.raw'
task: trunk_workers = 1, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash:expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template = '{cmd}' if executable('plink').target_exists() else plink_module
    plink \
      --bed ${_input[0]}  --bim ${_input[1]} --fam ${_input[2]} \
      --extract ${snp_list} --range \
      --recodeA \
      --threads ${numThreads} \
      --out ${_output:n} 

In [1]:
# Next, check whether those individuals carry a mental retardation diagnosis.
# Load their phenotype table from the extraction directory and report how
# many individuals (rows) it holds.
setwd("/home/dc2325/scratch60/plink_extract")
pheno <- read.csv(
  file = "individuals_test.csv",
  header = TRUE,
  sep = ","
)
nrow(pheno)

In [52]:
# Preview the first rows of the first 20 phenotype columns.
head(pheno[, seq_len(20)])

Unnamed: 0_level_0,eid,X4689.0.0.x,X4689.1.0.x,X4689.2.0.x,X4689.3.0.x,X5194.0.0.x,X5194.1.0.x,X5196.0.0.x,X5196.1.0.x,X5253.0.0.x,X5254.0.0.x,X5254.1.0.x,X5255.0.0.x,X5255.1.0.x,X5256.0.0.x,X5256.1.0.x,X5257.0.0.x,X5257.1.0.x,X5258.0.0.x,X5258.1.0.x
Unnamed: 0_level_1,<int>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
1,2237835,,,,,,,,,,,,,,,,,,,
2,3927542,,,,,,,,,,,,,,,,,,,


In [12]:
# Keep the participant id together with every X41270.* column, then report
# the dimensions of the resulting table.
library(dplyr)
icd10 <- select(pheno, "eid", starts_with("X41270"))
dim(icd10)

In [46]:
# Preview the first rows of the first 13 columns (eid plus 12 code slots).
head(icd10[, seq_len(13)])

Unnamed: 0_level_0,eid,X41270.0.0.x,X41270.0.1.x,X41270.0.2.x,X41270.0.3.x,X41270.0.4.x,X41270.0.5.x,X41270.0.6.x,X41270.0.7.x,X41270.0.8.x,X41270.0.9.x,X41270.0.10.x,X41270.0.11.x
Unnamed: 0_level_1,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<lgl>,<lgl>,<lgl>
1,2237835,H118,,,,,,,,,,,
2,3927542,D649,E059,L030,M8195,S7210,T920,W010,Y831,Z896,,,


In [49]:
# Flag every participant (row) that has at least one code containing "H11".
# Only the X41270.* code columns are searched, so `eid` and a `present`
# column left over from an earlier run of this cell never take part in the
# match, and all rows are covered rather than a hard-coded subset.
code_cols <- names(icd10)[startsWith(names(icd10), "X41270")]
# grepl() returns FALSE for NA, so empty code slots simply do not match.
code_hits <- lapply(icd10[code_cols], function(codes) grepl("H11", codes))
# OR the per-column hits together. The all-FALSE seed guarantees exactly one
# logical per row even when the table has a single row or no code columns.
icd10$present <- Reduce(`|`, code_hits, rep(FALSE, nrow(icd10)))

In [50]:
# Inspect the first values of the match flag.
head(icd10[["present"]])

In [42]:
# Preview columns 213 to 215, which end with the `present` flag.
head(icd10[, seq(213, 215)])

Unnamed: 0_level_0,X41270.0.211.x,X41270.0.212.x,present
Unnamed: 0_level_1,<lgl>,<lgl>,<lgl>
1,,,False
2,,,False
