### ExamineAgeMenopause
##### E Flynn
##### 7/2/2018

The goal of this notebook is to bin the data by age, and then also into pre-post menopause, and examine these differences. 

Then, we will set up the phenotype definition for females and menopause:
* Pre-menopause: stated they have not reached menopause -AND- are less than 60 years old
* Post-menopause: >2 years post menopause -AND- had menopause after age 40

Others:
* Peri-menopause: w/in 2y menopause
* Premature_menopause: menopause less than age 40
* Unlikely_pre: greater than 60y and says that not reached menopause
* Missing: prefer not to answer or missing

In [7]:
require('tidyverse')
options(stringsAsFactors=FALSE)
require('dplyr')
require('reshape2')

In [2]:
ss_phe <- read.table("../phe_extraction/sex_spec_pheno.txt", header=TRUE)



In [None]:
head(ss_phe)

In [5]:
ss_phe_long <- melt(ss_phe, id.vars=c("IID", "age", "sex"))
head(ss_phe_long)

IID,age,sex,variable,value
2502845,50,1,f.3140.0.0,
2314965,55,0,f.3140.0.0,0.0
1142584,60,0,f.3140.0.0,0.0
3665122,74,0,f.3140.0.0,0.0
4377492,65,0,f.3140.0.0,0.0
2652670,58,1,f.3140.0.0,


In [6]:
ss_phe_long$variable <- as.character(ss_phe_long$variable)

In [7]:
ss_phe_long2 <- ss_phe_long %>% separate(variable, c(NA, "trait", "visit", NA))
head(ss_phe_long2)

IID,age,sex,NA,trait,visit,NA.1,value
2502845,50,1,f,3140,0,0,
2314965,55,0,f,3140,0,0,0.0
1142584,60,0,f,3140,0,0,0.0
3665122,74,0,f,3140,0,0,0.0
4377492,65,0,f,3140,0,0,0.0
2652670,58,1,f,3140,0,0,


In [8]:
ss_mat <- dcast(ss_phe_long2,  IID + visit ~ trait, value.var="value")

In [10]:
# remove rows w all NAsz
keep.rows <- apply(ss_mat[,3:ncol(ss_mat)], 1, function(x) any(!is.na(x)))
table(keep.rows)
ss_mat2 <- ss_mat[keep.rows,]



keep.rows
  FALSE    TRUE 
1212655  295235 

In [12]:
# remove rows w IID < 0
ss_mat3 <- filter(ss_mat2, IID >= 0)
head(ss_mat3)

ss_mat4 <- full_join(select(ss_phe, IID, sex, age), ss_mat3)

IID,visit,2724,2804,2834,3140,3546,3581,3591,3700,3710,3720
1000028,0,0,26.0,0,0,,,0,20.0,28.0,0.0
1000045,0,1,-1.0,0,0,60.0,54.0,0,,,
1000052,0,1,-1.0,0,0,-1.0,-1.0,0,,,
1000069,0,1,26.0,0,0,,-1.0,1,,,
1000087,0,1,,0,0,,50.0,0,,,
1000118,0,1,-1.0,0,0,-1.0,53.0,0,,,


Joining, by = "IID"


In [15]:
# TODO - age is age at visit 0
# TOOD - change coluymn names
write.table(ss_mat4, file="../data/sex_spec_factor_mat.txt", row.names=FALSE, quote=FALSE, sep="\t")

In [5]:
ss_mat4 <- read.delim(file="../data/sex_spec_factor_mat.txt")

head(ss_mat4)
counts <- sapply(ss_mat4[,2:ncol(ss_mat4)], table)
colnames(ss_mat4)
table(ss_mat4[,c("visit", "X3140")])

IID,sex,age,visit,X2724,X2804,X2834,X3140,X3546,X3581,X3591,X3700,X3710,X3720
2502845,1,50,,,,,,,,,,,
2314965,0,55,0.0,3.0,27.0,0.0,0.0,,,0.0,,,0.0
1142584,0,60,0.0,1.0,35.0,0.0,0.0,52.0,45.0,0.0,,,
3665122,0,74,0.0,1.0,,0.0,0.0,55.0,42.0,1.0,,,
4377492,0,65,0.0,1.0,52.0,0.0,0.0,,52.0,0.0,,,
2652670,1,58,,,,,,,,,,,


     X3140
visit      0      1      2
    0 272246    150    222
    1  10399      1      1
    2  11312      1      5

In [9]:
phe_codes <- read.csv("../phe_extraction/ListPheCodes.csv", stringsAsFactors=FALSE)
phe_codes$X <- NULL

sex_spec <- filter(phe_codes, category == "sex specific")
sex_spec

#trait_to_name <- split(sex_spec$variable_name, sex_spec$trait)


trait,category,variable_name,readable_name
3140,sex specific,pregnant,pregnant
2724,sex specific,menopause,menopause
3581,sex specific,menopause_age,age menopause
3591,sex specific,hysterectomy,hysterectomy
2834,sex specific,oophorectomy,oophorectomy
3700,sex specific,time_since_period,time since last menstrual period
3710,sex specific,menstrual_length,length of menstrual cycle
3720,sex specific,menstruating,menstruating today
2804,sex specific,birth_control,age when last used the pill
3546,sex specific,HRT,age when last used HRT


## CREATE DERIVED PHENOTYPES

### Pregnant, Pill, or HRT

In [12]:
head(filter(ss_mat4, X3140==0))
table(is.na(ss_mat4$X3140) )

IID,sex,age,visit,X2724,X2804,X2834,X3140,X3546,X3581,X3591,X3700,X3710,X3720
2314965,0,55,0,3,27.0,0,0,,,0,,,0.0
1142584,0,60,0,1,35.0,0,0,52.0,45.0,0,,,
3665122,0,74,0,1,,0,0,55.0,42.0,1,,,
4377492,0,65,0,1,52.0,0,0,,52.0,0,,,
4015997,0,50,0,0,33.0,0,0,,,0,0.0,28.0,1.0
2076736,0,60,0,1,45.0,0,0,,45.0,0,,,



 FALSE   TRUE 
294337 230245 

In [24]:
# DERIVED PHENOTYPES:
#    preg: 1 = possibly or confirmed pregnant (exclude!), 0 = not pregnant, NA = did not answer 
#    hrt / pill:  1= still taking, 0 = not still taking, -1 = time not known, NA = did not answer
ss_mat5 <- ss_mat4 %>% mutate(
    preg=ifelse(X3140==1 | X3140==2, 1, X3140),
    pill=ifelse(X2804==-11, 1, ifelse(X2804 == -3 | X2804==-1,-9, 0)),
    hrt=ifelse(X3546==-11, 1, ifelse(X3546 == -3 | X3546==-1, -9, 0)))
table(ss_mat5$preg)
table(ss_mat5$hrt)
table(ss_mat5$pill)


     0      1 
293957    380 


   -9     0     1 
11233 83604 17571 


    -9      0      1 
 24424 208828   5099 

### Menopause

In [32]:
#head(ss_mat5)

ss_mat6 <- ss_mat5 %>% mutate(
    ooph= ifelse(X2834 == -3 | X2834==-5,-9, X2834),
    hyster= ifelse(X3591 == -3 | X3591==-5,-9, X3591),
    meno = ifelse(X2724 ==-3 | X2724==3,NA, ifelse(X2724 == 2 | X2724== 1, 1, X2724)), # hysterectomy / yes --> yes; not sure/prefer not --> -9
    hyster2= ifelse(X2724==2, 1, 0),
    meno.age=ifelse(X3581 == -3 | X3581==-1,-9, X3581)
    )

head(ss_mat6)
ss_mat7 <- mutate(ss_mat6, 
                  years.post=ifelse(meno.age<0, NA, age - meno.age),# compute the years since menopause
                  surgical.meno=ifelse(ooph == 1 | hyster==1 | hyster2 == 1,1,ifelse(X2834 == -9 & X3591==-9, NA, 0))
                 ) 



IID,sex,age,visit,X2724,X2804,X2834,X3140,X3546,X3581,⋯,X3710,X3720,preg,pill,hrt,ooph,hyster,meno,hyster2,meno.age
2502845,1,50,,,,,,,,⋯,,,,,,,,,,
2314965,0,55,0.0,3.0,27.0,0.0,0.0,,,⋯,,0.0,0.0,0.0,,0.0,0.0,,0.0,
1142584,0,60,0.0,1.0,35.0,0.0,0.0,52.0,45.0,⋯,,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,45.0
3665122,0,74,0.0,1.0,,0.0,0.0,55.0,42.0,⋯,,,0.0,,0.0,0.0,1.0,1.0,0.0,42.0
4377492,0,65,0.0,1.0,52.0,0.0,0.0,,52.0,⋯,,,0.0,0.0,,0.0,0.0,1.0,0.0,52.0
2652670,1,58,,,,,,,,⋯,,,,,,,,,,


In [40]:
label_col <- function(meno, meno.age, age, sex, years.post, surgical.meno){
(if (sex==1 & !is.na(sex)){
    "male"
} else if (any(sapply(c(meno, age, sex), is.na))){
    "missing"
} else if (surgical.meno==1){
    "surgical_meno"
} else if (meno==0){
    ifelse(age > 60, "likely_meno", "pre")
} else {
    if (is.na(meno.age)) {
        "missing_age"
    } else{
           ifelse(meno.age <= 40, "premature",
           ifelse(years.post < 2 | is.na(years.post), "peri", "post")
          ) 
    }


})}
 


df <- ss_mat7
df2 <- df
df2$meno.label <- mapply(label_col, df$meno, df$meno.age, df$age, df$sex, df$years.post, df$surgical.meno)
meno_df <- df2 %>% select(IID, sex, age, visit, preg, pill, hrt, ooph, hyster, meno, hyster2, meno.age, years.post, surgical.meno, meno.label)

In [42]:
table(meno_df$meno.label)
meno_df <- rename(meno_df, "age_v0" ="age")
head(meno_df)


  likely_meno          male       missing   missing_age          peri 
         6333        223531         27851            37           125 
         post           pre     premature surgical_meno 
       140462         57225         14673         54345 

IID,sex,age_v0,visit,preg,pill,hrt,ooph,hyster,meno,hyster2,meno.age,years.post,surgical.meno,meno.label
2502845,1,50,,,,,,,,,,,,male
2314965,0,55,0.0,0.0,0.0,,0.0,0.0,,0.0,,,0.0,missing
1142584,0,60,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,45.0,15.0,0.0,post
3665122,0,74,0.0,0.0,,0.0,0.0,1.0,1.0,0.0,42.0,32.0,1.0,surgical_meno
4377492,0,65,0.0,0.0,0.0,,0.0,0.0,1.0,0.0,52.0,13.0,0.0,post
2652670,1,58,,,,,,,,,,,,male


In [43]:
write.table(meno_df, file="../phe_extraction/menopause_phe_table.txt", sep="\t", quote=FALSE, row.names=FALSE)

## SANITY CHECKING

In [None]:

# FOR EACH VISIT

# 1) hysterectomy == menopause due to hysterectomy

# 2) oophorectomy == menopause

# 3) menopause, oophorectomy, hysterectomy, pregnant != menstruating, menstrual period recently 

# 4) hysterectomy != pregnant

# 5) HRT vs menopause (expect mostly post-menopausal)

# 6) time_since_period <= menstrual_length + 3

# 7) 10 < cycle_length < 60

# 8) birth_control == pre_menopause

# [ age at menopause < current age  for first visit ]


# EXPLORATORY
# - what do irregular cycles look like?
# - does anyone report irregular cycles on the pill?

# MENSTRUAL PHASE

In [None]:
# NOTE - these questions are poorly worded and have strange cutoffs...

# time since last period: 3700 http://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=3700
# length of menstrual cycle: 3710
# menstruating today: 3720

# We may need to go back to original data

meno_today <- read.table("../phefiles/ss/BIN_FC10003720.phe")
meno_length <- read.table("../phefiles/ss/INI3710.phe")
meno_time <- read.table("../phefiles/ss/INI3700.phe")

colnames(meno_time) <- c("ID", "ID2", "day_in_cycle")
colnames(meno_length) <- c("ID", "ID2", "cycle_length")
colnames(meno_today) <- c("ID", "ID2", "meno_today")


In [None]:
length_time <- full_join(meno_time %>% select(-ID2), meno_length %>% select(-ID2), by="ID")
cycle_info <- full_join(length_time, meno_today %>% select(-ID2), by="ID")

In [None]:
head(cycle_info)
table(cycle_info$cycle_length==-9)
table(cycle_info$meno_today==-9)

table(cycle_info$day_in_cycle==-9)

cycle_present <- filter(cycle_info, cycle_length!=-9 & day_in_cycle!=-9 & meno_today !=-9)

In [None]:
head(cycle_present)
table(cycle_present$meno_today) # 2 == yes, 1 == no

# HOWEVER - for a subset, we are fairly confident
# meno_length < 60 & meno_length > 14
range_filt <- cycle_present %>% filter(cycle_length <= 60 & cycle_length > 14)

In [None]:
cycle_labeled <- range_filt %>% mutate(approx_ov = cycle_length/2) %>% 
mutate(phase=ifelse(meno_today==2, "menstrual", ifelse(day_in_cycle < approx_ov, "follicular", "luteal")))

In [None]:
head(cycle_labeled)
table(cycle_labeled$phase)
write.table(cycle_labeled, file="../data/menstrual_phase.txt", sep="\t", row.names=FALSE, quote=FALSE)

In [None]:

# sanity check this
filter(range_filt, meno_today==2) %>% head()


In [None]:
#meno_today and meno_time to validate - should be a distribution
ggplot(filter(range_filt, meno_today==2), aes(day_in_cycle))+geom_histogram(binwidth=1)
### disproportionately day 0! I think ppl are mis-interpreting, estimate this

In [None]:
ggplot(range_filt, aes(cycle_length))+geom_histogram(binwidth=5)
ggplot(range_filt, aes(day_in_cycle))+geom_histogram(binwidth=5)

In [None]:
filter(meno_time, val < 0) %>% head()
filter(meno_length, val < 0) %>% head()
table(meno_today$val)

In [None]:
ggplot(meno_time, aes(val))+geom_histogram(binwidth=5)+xlim(-10, 60)+ylim(0, 1000)

#### Look at repeat visits

In [None]:
# 2nd visit

# how many are still pre-menopause?

# phase information