

## Code for producing the hormonal phenotyping matrix

E Flynn
5/24/2019

In [1]:
# Attach dependencies with library() so that a missing package stops the
# notebook here, instead of require() returning FALSE and a later cell failing
# with an unrelated "could not find function" error.
library(tidyverse)
library(data.table)
# Keep strings as character vectors; before R 4.0 data frames converted them
# to factors by default.
options(stringsAsFactors = FALSE)

Loading required package: tidyverse
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 2.2.1     ✔ purrr   0.2.4
✔ tibble  1.4.2     ✔ dplyr   0.7.4
✔ tidyr   0.8.1     ✔ stringr 1.3.0
✔ readr   1.1.1     ✔ forcats 0.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Loading required package: data.table

Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, first, last

The following object is masked from ‘package:purrr’:

    transpose



### Extract Traits

In [2]:
# Read the phenotype code list and preview it. The preview shows an empty
# trailing column parsed as `X`, which is dropped right after.
phe_codes <- read.csv("../phe_extraction/ListPheCodes.csv")
head(phe_codes)
phe_codes[["X"]] <- NULL

# Keep only the traits whose category is "sex specific" and display them.
sex_spec <- phe_codes %>%
  filter(category == "sex specific")
sex_spec


trait,category,variable_name,readable_name,X
3140,sex specific,pregnant,pregnant,
2724,sex specific,menopause,menopause,
3581,sex specific,menopause_age,age menopause,
3591,sex specific,hysterectomy,hysterectomy,
2834,sex specific,oophorectomy,oophorectomy,
3700,sex specific,time_since_period,time since last menstrual period,


trait,category,variable_name,readable_name
3140,sex specific,pregnant,pregnant
2724,sex specific,menopause,menopause
3581,sex specific,menopause_age,age menopause
3591,sex specific,hysterectomy,hysterectomy
2834,sex specific,oophorectomy,oophorectomy
3700,sex specific,time_since_period,time since last menstrual period
3710,sex specific,menstrual_length,length of menstrual cycle
3720,sex specific,menstruating,menstruating today
2804,sex specific,birth_control,age when last used the pill
3546,sex specific,HRT,age when last used HRT


#### Pregnancy data 

In [None]:
# FROM MASTER PHE
# for field_id in 3140 ; do zcat tab.columns.summary.20190418.tsv.gz | egrep "\sf.${field_id}"; done

# head -1 /oak/stanford/groups/mrivas/ukbb24983/phenotypedata/9796/21732/download/ukb21732.tab > tmp_colnames.txt

In [7]:
# Read the header line saved from the phenotype table (see the `head -1`
# command above) to find the columns that belong to field 3140 (pregnant).
preg_cols <- read.delim("../tmp_colnames.txt")
length(preg_cols)
head(preg_cols)

# The field id is the second "."-separated token of each column name
# (e.g. "f.3140.0.0" -> "3140"). Indexing with `[2]` gives NA rather than an
# error for a name without a ".", and vapply() pins the result type.
preg_col_fields <- vapply(
  strsplit(colnames(preg_cols), ".", fixed = TRUE),
  function(parts) parts[2],
  character(1)
)
# %in% never returns NA, so names without a field id are simply FALSE.
preg_cols2 <- setNames(preg_col_fields %in% "3140", colnames(preg_cols))
preg_cols2[[1]] <- TRUE  # always keep the first column (f.eid)
which(preg_cols2)

# extract these

“number of rows of result is not a multiple of vector length (arg 2)”

f.eid,f.21.0.0,f.21.1.0,f.21.2.0,f.23.0.0,f.23.1.0,f.23.2.0,f.31.0.0,f.34.0.0,f.48.0.0,⋯,f.41226.0.1,f.41226.0.2,f.41227.0.0,f.41227.0.1,f.41227.0.2,f.41228.0.0,f.41228.0.1,f.41228.0.2,f.41228.0.3,f.41252.0.0


In [None]:
# cut -f 1,753,754,755 /oak/stanford/groups/mrivas/ukbb24983/phenotypedata/9796/21732/download/ukb21732.tab > preg_data.txt

In [9]:
# Load the pregnancy columns (f.eid plus the three f.3140 visit columns).
# Read as a plain data.frame, like `tab_file`, so that the dplyr joins further
# down operate on tables of one consistent class.
preg_dat <- fread("../phe_extraction/preg_data.txt", data.table = FALSE)
head(preg_dat)

f.eid,f.3140.0.0,f.3140.1.0,f.3140.2.0
1918850,,,
2511282,,,
2840033,0.0,,
2312663,,,
4023158,0.0,,
5994491,,,


#### Sex-specific data

In [3]:
# Load the full phenotype table as a plain data.frame and preview its first
# five columns.
# NOTE - this is SLOW
tab_file <- fread(
  "/oak/stanford/groups/mrivas/ukbb24983/phenotypedata/2000269/21730/download/ukb21730.tab",
  data.table = FALSE
)
head(tab_file[, seq_len(5)])




f.eid,f.396.0.1,f.396.0.2,f.396.0.3,f.396.1.1
2829867,3,4,,
2809727,3,4,,
3025032,3,4,,
4662128,3,4,,
4621391,3,4,,
4563020,3,4,,


In [11]:
# Show the table's dimensions and keep its column names for field matching.
dim(tab_file)
list_cols <- names(tab_file)

In [12]:
# Field ids of the sex-specific traits to extract.
list_traits <- sex_spec[["trait"]]

In [13]:
# Flag the columns whose field id (the second "."-separated token of the
# column name) is one of the requested traits. `parts[2]` is NA, not an error,
# for a name without a ".", and `%in%` turns that into FALSE. vapply()
# guarantees a logical vector even for an empty input.
cols_keep <- vapply(
  strsplit(list_cols, ".", fixed = TRUE),
  function(parts) parts[2] %in% list_traits,
  logical(1)
)
names(cols_keep) <- list_cols
table(cols_keep)

cols_keep
FALSE  TRUE 
 1099    27 

In [14]:
# Always keep the first column (f.eid) alongside the trait columns.
cols_keep[1] <- TRUE
# drop = FALSE keeps `dat` a data.frame even if only one column is selected.
dat <- tab_file[, cols_keep, drop = FALSE]

# Count the retained columns per field id: three visits for each of nine traits.
trait_counts <- table(vapply(
  strsplit(colnames(dat), ".", fixed = TRUE),
  function(parts) parts[2],
  character(1)
))
print(trait_counts)
# Requested traits that have no column in this table.
setdiff(list_traits, names(trait_counts))


2724 2804 2834 3546 3581 3591 3700 3710 3720  eid 
   3    3    3    3    3    3    3    3    3    1 


In [15]:
# Preview the extracted columns, then remove the full table from the
# workspace now that the needed columns live in `dat`.
head(dat)
rm(list = "tab_file")



f.eid,f.2724.0.0,f.2724.1.0,f.2724.2.0,f.2804.0.0,f.2804.1.0,f.2804.2.0,f.2834.0.0,f.2834.1.0,f.2834.2.0,⋯,f.3591.2.0,f.3700.0.0,f.3700.1.0,f.3700.2.0,f.3710.0.0,f.3710.1.0,f.3710.2.0,f.3720.0.0,f.3720.1.0,f.3720.2.0
2829867,,,,,,,,,,⋯,,,,,,,,,,
2809727,0.0,,,30.0,,,0.0,,,⋯,,-1.0,,,-1.0,,,0.0,,
3025032,1.0,,,19.0,,,0.0,,,⋯,,,,,,,,,,
4662128,,,,,,,,,,⋯,,,,,,,,,,
4621391,1.0,,,,,,0.0,,,⋯,,,,,,,,,,
4563020,,,,,,,,,,⋯,,,,,,,,,,


In [16]:
# Add the pregnancy data. The key is stated explicitly so the join cannot
# silently pick up any other column name the two tables might come to share;
# a full join keeps participants present in either table.
dat2 <- full_join(preg_dat, dat, by = "f.eid")

Joining, by = "f.eid"


In [19]:
# Count the number of non-missing values per column. Iterating over the
# columns with vapply() avoids apply() coercing the whole table to a matrix
# and fixes the result type to integer.
numNonNa <- vapply(dat2, function(col) sum(!is.na(col)), integer(1))
numNonNa # some of these counts are very small - keep that in mind downstream

In [20]:
# Read the covariate table and keep the participant id, age and sex columns.
COVARIATE_MATRIX <- '/oak/stanford/groups/mrivas/ukbb24983/sqc/ukb24983_GWAS_covar.phe'
cov_mat <- read.table(
  COVARIATE_MATRIX,
  header = TRUE,
  stringsAsFactors = FALSE
)
covar_data <- cov_mat[c("IID", "age", "sex")]




In [22]:
# Attach age and sex to the phenotype columns, matching the covariate `IID`
# to the phenotype `f.eid`; the full join keeps ids found in either table.
dat3 <- covar_data %>%
  full_join(dat2, by = c(IID = "f.eid"))
head(dat3)

IID,age,sex,f.3140.0.0,f.3140.1.0,f.3140.2.0,f.2724.0.0,f.2724.1.0,f.2724.2.0,f.2804.0.0,⋯,f.3591.2.0,f.3700.0.0,f.3700.1.0,f.3700.2.0,f.3710.0.0,f.3710.1.0,f.3710.2.0,f.3720.0.0,f.3720.1.0,f.3720.2.0
2502845,50,1,,,,,,,,⋯,,,,,,,,,,
2314965,55,0,0.0,,,3.0,,,27.0,⋯,,,,,,,,0.0,,
1142584,60,0,0.0,,,1.0,,,35.0,⋯,,,,,,,,,,
3665122,74,0,0.0,,,1.0,,,,⋯,,,,,,,,,,
4377492,65,0,0.0,,,1.0,,,52.0,⋯,,,,,,,,,,
2652670,58,1,,,,,,,,⋯,,,,,,,,,,


In [24]:
# Save the combined table as a tab-separated file without row names or quoting.
write.table(
  dat3,
  file = "../phe_extraction/sex_spec_pheno.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

TODO:
- Redo menopause phenotyping
- Remove pregnant folks

#### Look at visit to visit variability

In [5]:
# load both visits of the traits
# NOTE(review): `biomarker_phe` and `phe_id` are not defined anywhere in the
# visible part of this notebook - confirm where they come from before running.
# NOTE(review): only the ".0.0" column of each trait is selected here, so
# despite the heading this picks a single visit per trait - confirm intent.
df_phe <- biomarker_phe[,c("f.eid", sprintf("f.%s.0.0", phe_id))]  # structured as phe, visit, info, we want visit 1
print("extracted")
# Prepends a second copy of the f.eid column to df_phe.
# NOTE(review): the purpose of the duplicate id column is not evident - TODO confirm.
df_phe2 <- cbind(df_phe[,"f.eid"], df_phe)

# 


In [None]:
# for most of these: -1 or -3 = NA

# Field ids of the hormone-related phenotypes, kept as character strings.
# The last element must not be followed by a comma: c("a", ) fails with
# "argument 2 is empty".
list.hormone.related.phe.ids <- c(
  "3140", # pregnant (remove everyone with = yes, 2=unsure)
  "2724", # had menopause (1 = yes, 0 = no, 2= hysterectomy, 3 = not sure)
  "3581", # age of menopause
  "3591", # hysterectomy
  "2834", # oophorectomy
  "3700", # time since last menstrual period
  "3710", # length of menstrual cycle (-6 = irregular cycle)
  "3720", # menstruating today
  "2804", # age when last used the pill (-11 = still taking)
  "3546"  # age when last used HRT (-11 = still taking)
)

# Field ids of additional covariates. The commented entries have no field id
# assigned yet.
other.covariates <- c(
  # age
  # sex
  # BMI
  # BP
  "2178", # health rating (1 = excellent, 2 = good, 3 = fair, 4 = poor)
  "2188"  # long-standing illness or disability
)

# Placeholder for further quantitative traits; c() with no arguments is NULL.
other.quant.traits <- c(
)

# look at these for all the dates