## Toy example for Alzheimer's disease

## Aim

Create a toy example dataset containing all Alzheimer's disease (AD) cases and 10,000 randomly sampled controls from the UK Biobank (UKBB).

In [2]:
#Load libraries
library(plyr)
library(tidyverse)
library(pander)
library(ggpubr)
library(rapportools)
library(ggplot2)
# Show which directory the session is currently in
getwd()
# Switch to the phenotype directory; the relative paths used later
# (source() and write.csv()) are resolved against it
setwd("/SAY/dbgapstg/scratch/UKBiobank/phenotype_files/pleiotropy_R01/ukb42495_updatedJune2020/")

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.1     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.0.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

“package ‘tibble’ was built under R version 3.6.3”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32marrange()[39m   masks [34mplyr[39m::arrange()
[31m✖[39m [34mpurrr[39m::[32mcompact()[39m   masks [34mplyr[39m::compact()
[31m✖[39m [34mdplyr[39m::[32mcount()[39m     masks [34mplyr[39m::count()
[31m✖[39m [34mdplyr[39m::[32mfailwith()[39m  masks [34mplyr[39m::failwith()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m    masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mid()[39m      

In [3]:
# Start from an empty workspace: remove every object in the current
# environment (attached packages are not affected)
rm(list = ls())

In [4]:
# Execute the import script found in the working directory.
# NOTE(review): presumably this is what creates the data frame `bd` used in
# the following cells; confirm against the script.
source(file = "ukb42495.r")

In [5]:
# Report the dimensions (rows, columns) of bd
# (bd is presumably created by the sourced import script; confirm)
dim(bd)

In [6]:
# Read the .fam file listing the individuals whose genotype data passed QC.
# The file has no header row, so the six column names are attached explicitly.
df.geno <- setNames(
  read.table(
    "/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated082020removedwithdrawnindiv.fam",
    header = FALSE,
    stringsAsFactors = FALSE
  ),
  c("FID", "IID", "ignore1", "ignore2", "ignore3", "ignore4")
)
# Number of individuals in the genotype list
nrow(df.geno)

In [7]:
# Preview the first column of bd; drop=FALSE keeps it as a one-column
# data frame so the column name is shown
head(bd[,1, drop=FALSE])

Unnamed: 0_level_0,f.eid
Unnamed: 0_level_1,<int>
1,6025442
2,1000019
3,1000022
4,1000035
5,1000046
6,1000054


In [8]:
# Rename the first column of bd (f.eid) to IID so it matches the ID column
# name used in df.geno
colnames(bd)[1] <- "IID"
# Check that the rename took effect
head(bd[, 1, drop = FALSE])

Unnamed: 0_level_0,IID
Unnamed: 0_level_1,<int>
1,6025442
2,1000019
3,1000022
4,1000035
5,1000046
6,1000054


In [9]:
# Inner join on IID (all = FALSE): keep only individuals that appear in both
# the genotype list (df.geno) and the phenotype table (bd)
df.gen.phen <- merge(x = df.geno, y = bd, by = "IID", all = FALSE)
# Number of individuals retained after the join
nrow(df.gen.phen)

In [10]:
# Preview the first f.41270 column (ICD10 codes)
head(df.gen.phen[["f.41270.0.0"]])

In [11]:
# Select cases using ICD10 codes.
# An individual is a case when any f.41270.* column holds one of the codes
# G300, G301, G308 or G309.
# The membership test runs once per column and the per-column results are
# OR-ed together. This avoids apply() over rows, which coerces the data frame
# to a character matrix and calls the function once per individual.
# init = FALSE makes the flag FALSE for everyone if no f.41270 column exists.
ad_cases <- df.gen.phen %>%
  mutate(
    cases = Reduce(
      `|`,
      lapply(
        select(., starts_with("f.41270")),
        function(icd_col) icd_col %in% c("G300", "G301", "G308", "G309")
      ),
      FALSE
    )
  ) %>%
  # %in% never yields NA, so the logical flag can be used directly
  filter(cases) %>%
  # Store case status as numeric (1 = case)
  mutate(cases = as.numeric(cases))
head(ad_cases)
dim(ad_cases)

Unnamed: 0_level_0,IID,FID,ignore1,ignore2,ignore3,ignore4,f.21.0.0,f.21.1.0,f.21.2.0,f.21.3.0,⋯,f.130715.0.0,f.130716.0.0,f.130717.0.0,f.131494.0.0,f.131495.0.0,f.131496.0.0,f.131497.0.0,f.132202.0.0,f.132203.0.0,cases
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<ord>,<ord>,<ord>,<ord>,⋯,<ord>,<date>,<ord>,<date>,<ord>,<date>,<ord>,<date>,<ord>,<dbl>
1,1001290,1001290,0,0,2,-9,Direct entry,,,,⋯,,,,,,,,,,1
2,1004098,1004098,0,0,1,-9,Direct entry,,,,⋯,,,,,,,,,,1
3,1005172,1005172,0,0,2,-9,Direct entry,,,,⋯,,,,,,,,,,1
4,1014853,1014853,0,0,1,-9,Direct entry,,,,⋯,,,,,,,,,,1
5,1030103,1030103,0,0,2,-9,Direct entry,,,,⋯,,,,,,,,,,1
6,1042356,1042356,0,0,2,-9,Direct entry,,,,⋯,,,,1990-11-01,Self-report and other source(s),,,,,1


In [12]:
# Select controls randomly.
# Controls are individuals with none of the ICD10 codes G300, G301, G308 or
# G309 in any f.41270.* column; 10000 of them are drawn at random.
# The seed is fixed so the same control set can be regenerated; its value is
# arbitrary. It does not reproduce draws made before the seed was introduced.
set.seed(2020)
# The membership test runs once per column and the per-column results are
# OR-ed together. This avoids apply() over rows, which coerces the data frame
# to a character matrix and calls the function once per individual.
# init = FALSE makes the flag FALSE for everyone if no f.41270 column exists.
ad_controls <- df.gen.phen %>%
  mutate(
    cases = Reduce(
      `|`,
      lapply(
        select(., starts_with("f.41270")),
        function(icd_col) icd_col %in% c("G300", "G301", "G308", "G309")
      ),
      FALSE
    )
  ) %>%
  # %in% never yields NA, so negating the flag is safe
  filter(!cases) %>%
  sample_n(10000) %>%
  # Store case status as numeric (0 = control)
  mutate(cases = as.numeric(cases))
head(ad_controls)
dim(ad_controls)

Unnamed: 0_level_0,IID,FID,ignore1,ignore2,ignore3,ignore4,f.21.0.0,f.21.1.0,f.21.2.0,f.21.3.0,⋯,f.130715.0.0,f.130716.0.0,f.130717.0.0,f.131494.0.0,f.131495.0.0,f.131496.0.0,f.131497.0.0,f.132202.0.0,f.132203.0.0,cases
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<ord>,<ord>,<ord>,<ord>,⋯,<ord>,<date>,<ord>,<date>,<ord>,<date>,<ord>,<date>,<ord>,<dbl>
1,5560557,5560557,0,0,1,-9,Direct entry,,,,⋯,,,,,,,,,,0
2,2124901,2124901,0,0,1,-9,Direct entry,Direct entry,,,⋯,,,,,,,,,,0
3,5427067,5427067,0,0,1,-9,Direct entry,,,,⋯,,,,,,,,,,0
4,4386047,4386047,0,0,1,-9,Direct entry,,,,⋯,,,,,,,,,,0
5,4942545,4942545,0,0,2,-9,Direct entry,,,,⋯,,,,,,,,,,0
6,4925168,4925168,0,0,2,-9,Direct entry,,,,⋯,,,,,,,,,,0


In [13]:
# Stack the cases (cases == 1) on top of the sampled controls (cases == 0)
# to form the toy data set
ad_toy <- rbind(ad_cases, ad_controls)

In [14]:
## Export files

In [17]:
# Step 5: write the combined case/control toy set to a CSV file in the
# working directory, without a row-name column
write.csv(x = ad_toy, file = "090920_UKBB_AD_toy.csv", row.names = FALSE)

In [18]:
# Also save the full merged genotype/phenotype table (all individuals kept by
# the join), without a row-name column
write.csv(x = df.gen.phen, file = "090920_ukb42495_genotypeqc.csv", row.names = FALSE)