# Phenotypes:
1. Hearing aids (f.3393)
2. Hearing difficulty/problems (f.2247)
3. Hearing difficulty/background noise (f.2257)

## Aim

Create a dataset of filtered individuals, using the inclusion and exclusion criteria for several hearing-related phenotypes, to be used for association analyses with the `LMM.ipynb` notebook.

## Location of files

The original UKBB data is in the shared folder:
```
/SAY/dbgapstg/scratch/UKBiobank/phenotype_files/pleiotropy_R01/ukb42495_updatedJune2020
```

The filtered dataset is in my personal folder:

```
/home/dc2325/project/HI_UKBB
```

## Important phenotypic files

1. `200804_UKBB_HI_genotypeqc.csv` File containing all individuals that passed QC and hearing impairment variables
2. `200804_UKBB_HI_genotypeqc_excr.csv` File with applied exclusion criteria as indicated [here](https://docs.google.com/document/d/1cpxTzElpsEkwmBDjnMBHg2wW7CL1AcG_b0_0wE_k5rQ/edit). **Note**: this file excludes individuals with otosclerosis, Meniere's and other diseases, if you need to filter those particular phenotypes use file 1 instead.
3. `200811_UKBB_Tinnitus_plan1_2_3_f4803` File with filtered phenotypes for tinnitus plan 1,2 and 3 and imputed noise variables
4. `200814_UKBB_HI_genotypeqc_excr_impvars` Database with qc'ed individuals, exclusion criteria, noise imputed vars and tinnitus phenotypes

## Analysis plan

The phenotypes to be analyzed are the following:

1. Hearing aid user (f.3393)
"Do you use a hearing aid most of the time?"

2. Hearing difficulty/problems (f.2247)
"Do you have any difficulty with your hearing?"

3. Hearing difficulty/background noise (f.2257)
"Do you find it difficult to follow a conversation if there is background noise (such as TV, radio, children playing)?"




## Load libraries and set working dir

In [None]:
#Load libraries
# NOTE(review): plyr is attached before tidyverse on purpose-looking order; dplyr
# (attached by tidyverse) then masks the plyr functions of the same name.
# Do not reorder these two lines without checking which mutate()/summarise() wins.
library(plyr)
library(tidyverse)
library(pander)
library(ggpubr)
library(rapportools)
# NOTE(review): ggplot2 is normally already attached by tidyverse, so this call
# is presumably redundant (harmless).
library(ggplot2)
#Get working directory
# Prints the current working directory to the cell output.
getwd()
#Set working directory
# NOTE(review): a user-specific path is hard-coded here; the notebook only runs
# unchanged for a user who has ~/project/HI_UKBB.
setwd('~/project/HI_UKBB')

In [None]:
# Load the phenotype table (first row holds the column names).
# NOTE(review): the classification step that follows reads from
# `data.geno.pheno`, not from `df.final.imp` -- confirm which object is the
# intended input.
df.final.imp <- read.csv(
  file = "/SAY/dbgapstg/scratch/UKBiobank/phenotype_files/hearing_impairment/200814_UKBB_HI_genotypeqc_excr_impvars",
  header = TRUE
)

step 1: classify cases and controls

In [None]:
library(tidyverse)

# NOTE(review): `data.geno.pheno` is never created in this notebook -- the
# read.csv() call above stores the phenotype table in `df.final.imp`. Confirm
# which object is the intended input before running this cell.
# Keep the sample IDs, f.31.0.0, the age at each of the four visits (f.21003)
# and the answers to the three hearing questions (f.3393, f.2247, f.2257).
hearing_all <- data.geno.pheno %>%
  select(
    IID, FID, f.31.0.0,
    f.21003.0.0, f.21003.1.0, f.21003.2.0, f.21003.3.0,
    f.3393.0.0, f.3393.1.0, f.3393.2.0, f.3393.3.0,
    f.2247.0.0, f.2247.1.0, f.2247.2.0, f.2247.3.0,
    f.2257.0.0, f.2257.1.0, f.2257.2.0, f.2257.3.0
  )

# Answers to f.3393 (hearing aid), one row per individual, one column per visit.
aid_answers <- as.matrix(select(hearing_all, starts_with("f.3393")))

#classify the cases and controls for 3393 (hearing aid)
# Case: at least one "Yes", and every other non-missing answer was given at a
# visit before the first "Yes" (i.e. the individual never goes back from "Yes").
# Written with all() on the visit indices so that rows without a "Yes" (or
# without any other answer) do not trigger max()/min() "returning -Inf/Inf"
# warnings.
hearing_all <- hearing_all %>%
  mutate(
    cases = apply(aid_answers, 1, function(x) {
      yes_visits <- which(x == "Yes")
      other_visits <- which(x != "Yes")
      length(yes_visits) > 0 && all(other_visits < yes_visits[1])
    })
  )

# Control: every answer is "No" or missing, and at least one answer is present.
# Controls are flagged with the string "FALSE"; everybody else gets NA.
only_no_or_missing <- rowSums(!(is.na(aid_answers) | aid_answers == "No")) == 0
answered_at_least_once <- rowSums(!is.na(aid_answers)) > 0

hearing_all <- hearing_all %>%
  mutate(control = ifelse(only_no_or_missing & answered_at_least_once, "FALSE", NA))


#12460 individuals are cases for 3393
hearing_aid_cases <- hearing_all %>%
  filter(cases)


#217892 individuals are controls for 3393
hearing_control <- hearing_all %>%
  filter(control == "FALSE") %>%
  select(-cases)

#230352 are either cases or controls for 3393
# Individuals that are neither (inconsistent or all-missing answers) are dropped.
hearing_all <- hearing_all %>%
  filter(cases | !is.na(control))

In [None]:
# Collapse the two flags for 3393 into a single status column.
library(dplyr)

# First non-missing value of `cases` / `control` (the latter is stored as text).
aid_status <- coalesce(hearing_all$cases, as.logical(hearing_all$control))

# Store the status as a factor labelled "control" / "case" and drop the
# intermediate flag columns.
hearing_all <- hearing_all %>%
  mutate(
    hearing_aid_cat = recode(
      as.factor(aid_status),
      "FALSE" = "control",
      "TRUE" = "case"
    )
  ) %>%
  select(-cases, -control)


# Number of individuals left without a status
sum(is.na(hearing_all$hearing_aid_cat))

step 2: get the ages for hearing aids (3393)

In [None]:
### Extract age for Control (3393)
aid_age_control <- hearing_all %>%
  filter(hearing_aid_cat == "control")


#find out the age at the last visit where the answer was "No" (control)
# The age is looked up in a matrix built from the age columns only. Running
# apply() over the whole data frame would coerce every row to character and
# store the age as text instead of a number.
aid_cols <- names(aid_age_control)[startsWith(names(aid_age_control), "f.3393.")]
# Age column of the same visit, e.g. f.3393.1.0 -> f.21003.1.0
age_cols <- sub("f.3393.", "f.21003.", aid_cols, fixed = TRUE)
stopifnot(length(aid_cols) > 0, all(age_cols %in% names(aid_age_control)))

aid_answers <- as.matrix(aid_age_control[aid_cols])
visit_ages <- as.matrix(aid_age_control[age_cols])

# Index of the last visit answered "No" (NA if there is none)
last_no_visit <- apply(aid_answers == "No", 1, function(is_no) rev(which(is_no))[1])

# Pick, for every row, the age in the column of that visit
aid_age_control$age_aid <- as.numeric(
  visit_ages[cbind(seq_len(nrow(visit_ages)), last_no_visit)]
)

res <- head(aid_age_control)

In [None]:
### Extract age for Cases (3393)
aid_age_case <- hearing_all %>%
  filter(hearing_aid_cat == "case")

#find out the age at the first visit where the answer was "Yes" (case)
# The age is looked up in a matrix built from the age columns only. Running
# apply() over the whole data frame would coerce every row to character and
# store the age as text instead of a number.
aid_cols <- names(aid_age_case)[startsWith(names(aid_age_case), "f.3393.")]
# Age column of the same visit, e.g. f.3393.1.0 -> f.21003.1.0
age_cols <- sub("f.3393.", "f.21003.", aid_cols, fixed = TRUE)
stopifnot(length(aid_cols) > 0, all(age_cols %in% names(aid_age_case)))

aid_answers <- as.matrix(aid_age_case[aid_cols])
visit_ages <- as.matrix(aid_age_case[age_cols])

# Index of the first visit answered "Yes" (NA if there is none)
first_yes_visit <- apply(aid_answers == "Yes", 1, function(is_yes) which(is_yes)[1])

# Pick, for every row, the age in the column of that visit
aid_age_case$age_aid <- as.numeric(
  visit_ages[cbind(seq_len(nrow(visit_ages)), first_yes_visit)]
)

res <- head(aid_age_case)

#merge age for cases and controls
hearing_clean <- rbind(aid_age_case, aid_age_control)
# Make sure the merged age is numeric even if one of the two parts stored it as text
hearing_clean$age_aid <- as.numeric(as.character(hearing_clean$age_aid))
dim(hearing_clean)

## Hearing difficulty/problems (2247)
### step1: classify cases and controls

In [None]:
# Classify hearing difficulty/problems (f.2247) for the individuals that
# already have a hearing-aid status.
# Answers to f.2247, one row per individual, one column per visit.
diff_answers <- as.matrix(select(hearing_clean, starts_with("f.2247")))

# Case: at least one "Yes", and every other non-missing answer was given at a
# visit before the first "Yes". Written with all() on the visit indices so
# that rows without a "Yes" do not trigger max()/min() "returning -Inf/Inf"
# warnings.
hearing_diff <- hearing_clean %>%
  mutate(
    cases = apply(diff_answers, 1, function(x) {
      yes_visits <- which(x == "Yes")
      other_visits <- which(x != "Yes")
      length(yes_visits) > 0 && all(other_visits < yes_visits[1])
    })
  )

# Control: every answer is "No" or missing, and at least one answer is present.
# Controls are flagged with the string "FALSE"; everybody else gets NA.
hearing_diff$control <- ifelse(
  rowSums(!(is.na(diff_answers) | diff_answers == "No")) == 0 &
    rowSums(!is.na(diff_answers)) > 0,
  "FALSE",
  NA
)



#90761 individuals are cases for 2247  (also have hearing aid data)
hearing_diff_cases <- hearing_diff %>%
  filter(cases)


#125358 individuals are controls for 2247 (also have hearing aid data)
hearing_diff_control <- hearing_diff %>%
  filter(control == "FALSE") %>%
  select(-cases)

#216119 individuals who are either case or control for 2247 (also have hearing aid data)
hearing_diff <- hearing_diff %>%
  filter(cases | !is.na(control))


###merge cases and controls

hearing_diff$hearing_diff_cat <- coalesce(hearing_diff$cases, as.logical(hearing_diff$control))

# Store the status as a factor labelled "control" / "case" and drop the flags
hearing_diff <- hearing_diff %>%
  select(-cases, -control) %>%
  mutate(hearing_diff_cat = as.factor(hearing_diff_cat),
         hearing_diff_cat = recode(hearing_diff_cat, "FALSE" = "control", "TRUE" = "case"))

#85 state they have no hearing difficulty but they wear hearing aids
check_inconsistence <- hearing_diff %>%
  filter(hearing_diff_cat == "control" & hearing_aid_cat == "case")

#reclassify these 85 individuals as cases for 2247
# hearing_diff_cat_new is "case" when either 2247 or 3393 says case, and
# "control" only when both say control.
hearing_diff <- hearing_diff %>%
  mutate(hearing_diff_cat_new = case_when(
    hearing_diff_cat == "control" & hearing_aid_cat == "case" ~ "case",
    hearing_diff_cat == "case" & hearing_aid_cat == "case" ~ "case",
    hearing_diff_cat == "case" & hearing_aid_cat == "control" ~ "case",
    hearing_diff_cat == "control" & hearing_aid_cat == "control" ~ "control"))

# 90761 + 85 = 90846 who are cases
check_merge <- hearing_diff %>%
  filter(hearing_diff_cat_new == "case")

### Step 2: get the ages for Hearing difficulty/problems (2247)
### Extract age for Control (2247)
```{r age for control,warning=FALSE, collapse=FALSE,echo=T}
hearing_diff_age_control <- hearing_diff %>%
  filter(hearing_diff_cat_new == "control")

#get the age at last visit for control (last visit where 2247 was answered "No")
# The age is looked up in a matrix built from the age columns only. Running
# apply() over the whole data frame would coerce every row to character and
# store the age as text instead of a number.
diff_cols <- names(hearing_diff_age_control)[
  startsWith(names(hearing_diff_age_control), "f.2247.")
]
# Age column of the same visit, e.g. f.2247.1.0 -> f.21003.1.0
age_cols <- sub("f.2247.", "f.21003.", diff_cols, fixed = TRUE)
stopifnot(length(diff_cols) > 0, all(age_cols %in% names(hearing_diff_age_control)))

diff_answers <- as.matrix(hearing_diff_age_control[diff_cols])
visit_ages <- as.matrix(hearing_diff_age_control[age_cols])

# Index of the last visit answered "No" (NA if there is none)
last_no_visit <- apply(diff_answers == "No", 1, function(is_no) rev(which(is_no))[1])

# Pick, for every row, the age in the column of that visit
hearing_diff_age_control$age_diff <- as.numeric(
  visit_ages[cbind(seq_len(nrow(visit_ages)), last_no_visit)]
)

res <- head(hearing_diff_age_control)

In [None]:
# Columns holding the 2247 answers and the age at the same visit
# (e.g. f.2247.1.0 -> f.21003.1.0)
diff_cols <- names(hearing_diff)[startsWith(names(hearing_diff), "f.2247.")]
age_cols <- sub("f.2247.", "f.21003.", diff_cols, fixed = TRUE)
stopifnot(length(diff_cols) > 0, all(age_cols %in% names(hearing_diff)))

# Age at the first (or, with pick_last = TRUE, the last) visit where the 2247
# answer equals `answer`; one numeric value per row of `df`, NA when the
# answer never occurs. The lookup uses matrices of the answer/age columns
# only: apply() over the whole data frame would coerce every row to character
# and return the age as text.
age_at_answer <- function(df, answer, pick_last = FALSE) {
  answers <- as.matrix(df[diff_cols])
  ages <- as.matrix(df[age_cols])
  visit <- apply(answers == answer, 1, function(hit) {
    at <- which(hit)
    if (pick_last) rev(at)[1] else at[1]
  })
  as.numeric(ages[cbind(seq_len(nrow(ages)), visit)])
}

#for those who are cases for hearing difficulty -- get the age at first visit for case
hearing_diff_age_case <- hearing_diff %>%
  filter(hearing_diff_cat == "case")

hearing_diff_age_case$age_diff <- age_at_answer(hearing_diff_age_case, "Yes")

res <- head(hearing_diff_age_case)

#for those who are control for hearing difficulty (2247) but cases for hearing aid (3393) -- get the age of 2247 at last visit  (85 individuals)

hearing_diff_age_case_2 <- hearing_diff %>%
  filter(hearing_diff_cat == "control" & hearing_aid_cat == "case")

hearing_diff_age_case_2$age_diff <- age_at_answer(
  hearing_diff_age_case_2, "No", pick_last = TRUE
)


# Stack cases, controls and the reclassified cases
hearing_diff_clean <- rbind(hearing_diff_age_case, hearing_diff_age_control, hearing_diff_age_case_2)
# Make sure the merged age is numeric even if one of the parts stored it as text
hearing_diff_clean$age_diff <- as.numeric(as.character(hearing_diff_clean$age_diff))
dim(hearing_diff_clean)

## Hearing difficulty/background noise (2257)
### step1: classify cases and controls

In [None]:
# Answers to f.2257 (difficulty with background noise), one row per
# individual, one column per visit.
noise_answers <- as.matrix(select(hearing_diff_clean, starts_with("f.2257")))

# Case: at least one "Yes", and every other non-missing answer was given at a
# visit before the first "Yes". Written with all() on the visit indices so
# that rows without a "Yes" do not trigger max()/min() "returning -Inf/Inf"
# warnings.
hearing_noise <- hearing_diff_clean %>%
  mutate(
    cases = apply(noise_answers, 1, function(x) {
      yes_visits <- which(x == "Yes")
      other_visits <- which(x != "Yes")
      length(yes_visits) > 0 && all(other_visits < yes_visits[1])
    })
  )

# Control: every answer is "No" or missing, and at least one answer is present.
# Controls are flagged with the string "FALSE"; everybody else gets NA.
hearing_noise$control <- ifelse(
  rowSums(!(is.na(noise_answers) | noise_answers == "No")) == 0 &
    rowSums(!is.na(noise_answers)) > 0,
  "FALSE",
  NA
)

#123870 individuals are cases for 2257  (also have hearing aid data + hearing difficulty)
hearing_noise_cases <- hearing_noise %>%
  filter(cases)


#86775 individuals are controls for 2257 (also have hearing aid data + hearing difficulty)
hearing_noise_control <- hearing_noise %>%
  filter(control == "FALSE") %>%
  select(-cases)

#210645 individuals who are either case or control for 2257 (also have hearing aid data)
hearing_noise <- hearing_noise %>%
  filter(cases | !is.na(control))
```


### merge cases and controls (2257)
```{r merge cases and controls,warning=FALSE, collapse=FALSE,echo=T}
# Collapse the two flags for hearing noise (2257) into a single status column.
# First non-missing value of `cases` / `control` (the latter is stored as text).
noise_status <- coalesce(hearing_noise$cases, as.logical(hearing_noise$control))

# Store the status as a factor labelled "control" / "case" and drop the
# intermediate flag columns.
hearing_noise <- hearing_noise %>%
  mutate(
    hearing_noise_cat = recode(
      as.factor(noise_status),
      "FALSE" = "control",
      "TRUE" = "case"
    )
  ) %>%
  select(-cases, -control)


# Individuals whose 2257 status disagrees with their (reclassified) 2247 status:
#60827 inconsistent for hearing_noise and hearing_diff (conflict of cases and control)
check_noise_inconsistence <- hearing_noise %>%
  filter(hearing_noise_cat != hearing_diff_cat_new)

# Keep only the concordant individuals:
#149818 left in the study (Individuals who are cases for one field and controls for the other field should be removed from the analysis)  210645 - 60827 = 149818
hearing_noise <- hearing_noise %>%
  filter(hearing_noise_cat == hearing_diff_cat_new)

```



### Step 2: get the ages for Hearing difficulty/background noise (2257) 
### Extract age for Control (2257)
```{r age for control,warning=FALSE, collapse=FALSE,echo=T}
noise_age_control <- hearing_noise %>%
  filter(hearing_noise_cat == "control")

#get the age at last visit for controls (last visit where 2257 was answered "No")
# The age is looked up in a matrix built from the age columns only. Running
# apply() over the whole data frame would coerce every row to character and
# store the age as text instead of a number.
noise_cols <- names(noise_age_control)[startsWith(names(noise_age_control), "f.2257.")]
# Age column of the same visit, e.g. f.2257.1.0 -> f.21003.1.0
age_cols <- sub("f.2257.", "f.21003.", noise_cols, fixed = TRUE)
stopifnot(length(noise_cols) > 0, all(age_cols %in% names(noise_age_control)))

noise_answers <- as.matrix(noise_age_control[noise_cols])
visit_ages <- as.matrix(noise_age_control[age_cols])

# Index of the last visit answered "No" (NA if there is none)
last_no_visit <- apply(noise_answers == "No", 1, function(is_no) rev(which(is_no))[1])

# Pick, for every row, the age in the column of that visit
noise_age_control$age_noise <- as.numeric(
  visit_ages[cbind(seq_len(nrow(visit_ages)), last_no_visit)]
)

res <- head(noise_age_control)

```


### Extract age for Cases (2257)
```{r age for cases,warning=FALSE, collapse=FALSE,echo=T}
noise_age_case <- hearing_noise %>%
  filter(hearing_noise_cat == "case")


#get the age at first visit for cases (first visit where 2257 was answered "Yes")
# The age is looked up in a matrix built from the age columns only. Running
# apply() over the whole data frame would coerce every row to character and
# store the age as text instead of a number.
noise_cols <- names(noise_age_case)[startsWith(names(noise_age_case), "f.2257.")]
# Age column of the same visit, e.g. f.2257.1.0 -> f.21003.1.0
age_cols <- sub("f.2257.", "f.21003.", noise_cols, fixed = TRUE)
stopifnot(length(noise_cols) > 0, all(age_cols %in% names(noise_age_case)))

noise_answers <- as.matrix(noise_age_case[noise_cols])
visit_ages <- as.matrix(noise_age_case[age_cols])

# Index of the first visit answered "Yes" (NA if there is none)
first_yes_visit <- apply(noise_answers == "Yes", 1, function(is_yes) which(is_yes)[1])

# Pick, for every row, the age in the column of that visit
noise_age_case$age_noise <- as.numeric(
  visit_ages[cbind(seq_len(nrow(visit_ages)), first_yes_visit)]
)

res <- head(noise_age_case)

# Stack cases and controls. Both age columns are forced to numeric so that the
# comparison and pmin() below work on numbers; on text they would compare
# lexicographically.
hearing_noise_clean <- rbind(noise_age_case, noise_age_control) %>%
  mutate(
    age_noise = as.numeric(as.character(age_noise)),
    age_diff = as.numeric(as.character(age_diff))
  )



#3252 age inconsistent for 2247 and 2257
age_inconsistence <- hearing_noise_clean %>%
  filter(age_diff != age_noise) %>%
  select(
    IID, FID, f.31.0.0,
    f.21003.0.0, f.21003.1.0, f.21003.2.0, f.21003.3.0,
    f.2247.0.0, f.2247.1.0, f.2247.2.0, f.2247.3.0,
    f.2257.0.0, f.2257.1.0, f.2257.2.0, f.2257.3.0,
    age_diff, age_noise, hearing_noise_cat
  )


#get the minimum age for 2247 and 2257
#drop unnecessary variables
hearing_noise_clean <- hearing_noise_clean %>%
  mutate(age_diff_noise = pmin(age_noise, age_diff)) %>%
  select(-hearing_diff_cat, -age_diff, -age_noise, -age_aid)