# Speech Reception Threshold (SRT) phenotype (quantitative)

## Aim

Create a dataset of filtered individuals using the inclusion and exclusion criteria for SRT phenotype to perform association analyses using the LMM.ipynb. 

## Location of files

In the shared folder is the original UKBB data
```
/SAY/dbgapstg/scratch/UKBiobank/phenotype_files/pleiotropy_R01/ukb42495_updatedJune2020
```

The filtered dataset is in my personal folder

```
/home/dc2325/project/HI_UKBB
```

## Important phenotypic files

1. `200804_UKBB_HI_genotypeqc.csv` File containing all individuals that passed QC and hearing impairment variables
2. `200804_UKBB_HI_genotypeqc_excr.csv` File with applied exclusion criteria as indicated [here](https://docs.google.com/document/d/1cpxTzElpsEkwmBDjnMBHg2wW7CL1AcG_b0_0wE_k5rQ/edit). **Note**: this file excludes individuals with otosclerosis, Meniere's and other diseases, if you need to filter those particular phenotypes use file 1 instead.
3. `200811_UKBB_Tinnitus_plan1_2_3_f4803` File with filtered phenotypes for tinnitus plan 1,2 and 3 and imputed noise variables
4. `200814_UKBB_HI_genotypeqc_excr_impvars` Database with qc'ed individuals, exclusion criteria, noise imputed vars and tinnitus phenotypes

## SRT analysis plan

**The phenotypes to be used are as follows:**
1. Left ear f.20019
2. Right ear f.20021
3. Best ear (create a new variable extracting the min SRT value among f.20019 and f.20021)
4. Worst ear (create a new variable extracting the max SRT value among f.20019 and f.20021)

**Age is calculated as follows:**

- For people with repeated measures take age at last visit and measurement at last visit
- For people with only one measure take age at that visit

**Sex corresponds to f.22001 (genetic sex):**

- Male = 0
- Female = 1

**Noisy workplace and loud music exposure frequency: same as for Tinnitus**
                
1. Remove inconsistent individuals 
    - said 1,2 or 3 and in following visits said 0
    - said a higher exposure (e.g 3) and then a lower one (e.g 1 or 2) in following visits
2. Retain consistent individuals and use highest reported exposure

**The SRT phenotypes need to be inverse normalized**

**Covariates to be included in the analysis include:**

1. Age at time of test (calculated from f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0)
2. Sex f.22001
3. Volume left f.4270 and right f.4277
4. Noisy workplace f.4825
5. Loud music exposure f.4836

## Load libraries and set working dir

In [None]:
#Load libraries
# plyr is attached before tidyverse, so the dplyr verbs attached by tidyverse
# (summarise, mutate, ...) mask plyr's functions of the same name.
library(plyr)
library(tidyverse)
library(pander)
library(ggpubr)
library(rapportools)
library(ggplot2)
#Get working directory
# Prints the directory the session started in, before it is changed below.
getwd()
#Set working directory
# Relative paths used after this point resolve against this folder.
setwd('~/project/HI_UKBB')

## Clean workspace

In [None]:
# Clean workspace
# Removes every visible object from the current environment. Attached
# packages and the working directory set above are not affected.
rm(list=ls())

## Filter SRT codes

* Obtain scores for right and left ear
* Obtain the age at the last visit

In [None]:
# Read the phenotype table (QC'ed individuals, exclusion criteria applied,
# imputed noise variables and tinnitus phenotypes) into df.final.imp.
hi_phenotype_path <- '/SAY/dbgapstg/scratch/UKBiobank/phenotype_files/hearing_impairment/200814_UKBB_HI_genotypeqc_excr_impvars'
df.final.imp <- read.csv(file = hi_phenotype_path, header = TRUE)

In [None]:
# Keep the identifiers, sex, age at each visit, and the per-visit SRT scores
# (f.20019 left, f.20021 right) and volume settings (f.4270 left, f.4277
# right), then collapse each repeated field to its last non-missing visit.
# NOTE(review): the analysis plan names f.22001 as the sex field, but f.31.0.0
# is what gets selected here -- confirm which one is intended.
# NOTE(review): srt_age is the last recorded age over all visits, which is only
# the age at the SRT test if the test was taken at that same visit -- confirm.

# Last non-missing entry of one row of repeated measures (NA when all missing).
last_observed <- function(visit_values) {
  observed <- visit_values[!is.na(visit_values)]
  tail(observed, n = 1)[1]
}

srt_all <- df.final.imp %>%
  select(
    IID, FID, f.31.0.0,
    f.21003.0.0, f.21003.1.0, f.21003.2.0, f.21003.3.0,
    f.20019.0.0, f.20019.1.0, f.20019.2.0, f.20019.3.0,
    f.20021.0.0, f.20021.1.0, f.20021.2.0, f.20021.3.0,
    f.4270.0.0, f.4270.1.0, f.4270.2.0, f.4270.3.0,
    f.4277.0.0, f.4277.1.0, f.4277.2.0, f.4277.3.0
  )

srt_clean <- srt_all %>%
  mutate(
    srt_score_left = apply(select(., starts_with("f.20019")), 1, last_observed),
    srt_score_right = apply(select(., starts_with("f.20021")), 1, last_observed),
    srt_age = apply(select(., starts_with("f.21003")), 1, last_observed),
    left_volume = apply(select(., starts_with("f.4270")), 1, last_observed),
    right_volume = apply(select(., starts_with("f.4277")), 1, last_observed)
  )

In [None]:
# Tabulate how many individuals have an SRT score for neither ear, one ear, or
# both ears. Counts use nrow(): length() on a data frame returns the number of
# columns, not the number of individuals. The figures in the comments are the
# counts recorded by the original author.

# No score for either ear (211763 individuals)
srt_NA <- srt_clean %>%
  filter(is.na(srt_score_left) & is.na(srt_score_right))
cat("There are", nrow(srt_NA), "individuals with missing data for both ears.\n")

# Everyone with a left-ear score (141029)
left_ear <- srt_clean %>%
  filter(!is.na(srt_score_left))
cat("There are", nrow(left_ear), "individuals with SRT score for the left ear.\n")

# Everyone with a right-ear score (141030)
right_ear <- srt_clean %>%
  filter(!is.na(srt_score_right))
cat("There are", nrow(right_ear), "individuals with SRT score for the right ear.\n")

# Left-ear score missing, right-ear score present (1555)
left_ear_na <- srt_clean %>%
  filter(is.na(srt_score_left) & !is.na(srt_score_right))
cat("There are", nrow(left_ear_na),
    "individuals with SRT score for the right ear but not for the left ear.\n")

# Right-ear score missing, left-ear score present (1554)
right_ear_na <- srt_clean %>%
  filter(is.na(srt_score_right) & !is.na(srt_score_left))
cat("There are", nrow(right_ear_na),
    "individuals with SRT score for the left ear but not for the right ear.\n")

# Score present for both ears (139475 = 141030 - 1555 = 141029 - 1554)
both_ear <- srt_clean %>%
  filter(!is.na(srt_score_left) & !is.na(srt_score_right))
cat("There are", nrow(both_ear), "individuals with SRT score for both ears.\n")

# Score present for at least one ear (142584)
either_ear <- srt_clean %>%
  filter(!is.na(srt_score_left) | !is.na(srt_score_right))
cat("There are", nrow(either_ear),
    "individuals with SRT score for at least one ear.\n")

In [None]:
### Classification for SRT Score 
```{r SRT exploratory (tables),warning=FALSE, collapse=FALSE,echo=T}
#create a categorical variable to classify the score for left and right ear
# Individuals with a score for at least one ear are kept. Each ear is labelled
# "normal" (< -5.5), "insufficient" (-5.5 to -3.5) or "poor" (> -3.5); an ear
# with no score gets the empty label "".
# NOTE(review): the previously recorded count (138901) came from an outlier
# filter that also dropped everyone with a missing score in one ear; re-run to
# obtain the count for the NA-safe filter below.
srt_cat <- srt_clean %>%
  filter(!is.na(srt_score_left) | !is.na(srt_score_right)) %>%
  mutate(srt_left_cat = case_when(
    srt_score_left < -5.5 ~ "normal",
    -5.5 <= srt_score_left & srt_score_left <= -3.5 ~ "insufficient",
    srt_score_left > -3.5 ~ "poor",
    TRUE ~ ""
  )) %>%
  mutate(srt_right_cat = case_when(
    srt_score_right < -5.5 ~ "normal",
    -5.5 <= srt_score_right & srt_score_right <= -3.5 ~ "insufficient",
    srt_score_right > -3.5 ~ "poor",
    TRUE ~ ""
  )) %>%
  # remove the score 8 (outlier) in either ear. %in% never returns NA, so an
  # individual with only one measured ear is kept (`x == 8` is NA for a
  # missing score and filter() drops NA rows).
  filter(!(srt_score_left %in% 8), !(srt_score_right %in% 8))

# Minimum or maximum of the non-missing values in one row of scores; NA when
# the row has no score at all. Missing values are skipped directly instead of
# being recoded to 0, because 0 is a valid SRT score and must not be discarded.
srt_extreme <- function(scores, fun) {
  observed <- scores[!is.na(scores)]
  if (length(observed) == 0) {
    return(NA_real_)
  }
  fun(observed)
}

library(dplyr)

# Create two new variables to show the best ear and the worst ear
# best = lowest score, worst = highest score over every f.20019 / f.20021
# column (all visits, both ears).
# NOTE(review): this uses scores from all visits, whereas srt_score_left/right,
# the outlier filter above and the age covariate use the last visit only --
# confirm that mixing visits is intended.
all_visit_scores <- srt_cat %>%
  select(starts_with("f.200")) %>%
  as.matrix()

srt_best_worst <- srt_cat %>%
  mutate(best = apply(all_visit_scores, 1, srt_extreme, fun = min),
         worst = apply(all_visit_scores, 1, srt_extreme, fun = max))

## Summary statistics

In [None]:
# Descriptive statistics (sd, mean, median, range, quantiles) for the left,
# right, best and worst ear scores. Rows missing the score being summarised
# are excluded first; each result is stored and printed as a knitr::kable table.

# Left ear
left_scored <- filter(srt_best_worst, !is.na(srt_score_left))
left_stats <- summarise(
  left_scored,
  std_left = sd(srt_score_left),
  mean_left = mean(srt_score_left),
  median_left = median(srt_score_left),
  min_left = min(srt_score_left),
  max_left = max(srt_score_left),
  `25%_quantile` = quantile(srt_score_left, probs = 0.25),
  `75%_quantile` = quantile(srt_score_left, probs = 0.75)
)
summary_left <- knitr::kable(left_stats, digits = 2,
                             caption = "Summary Table for Left Ear")

summary_left

# Right ear
right_scored <- filter(srt_best_worst, !is.na(srt_score_right))
right_stats <- summarise(
  right_scored,
  std_right = sd(srt_score_right),
  mean_right = mean(srt_score_right),
  median_right = median(srt_score_right),
  min_right = min(srt_score_right),
  max_right = max(srt_score_right),
  `25%_quantile` = quantile(srt_score_right, probs = 0.25),
  `75%_quantile` = quantile(srt_score_right, probs = 0.75)
)
summary_right <- knitr::kable(right_stats, digits = 2,
                              caption = "Summary Table for Right Ear")

summary_right

# Best ear (also reports the 5% and 95% quantiles)
best_scored <- filter(srt_best_worst, !is.na(best))
best_stats <- summarise(
  best_scored,
  std_best = sd(best),
  mean_best = mean(best),
  median_best = median(best),
  min_best = min(best),
  max_best = max(best),
  `5%_quantile` = quantile(best, probs = 0.05),
  `25%_quantile` = quantile(best, probs = 0.25),
  `75%_quantile` = quantile(best, probs = 0.75),
  `95%_quantile` = quantile(best, probs = 0.95)
)
summary_best <- knitr::kable(best_stats, digits = 2,
                             caption = "Summary Table for Best Ear")

summary_best

# Worst ear (also reports the 5% and 95% quantiles)
worst_scored <- filter(srt_best_worst, !is.na(worst))
worst_stats <- summarise(
  worst_scored,
  std_worst = sd(worst),
  mean_worst = mean(worst),
  median_worst = median(worst),
  min_worst = min(worst),
  max_worst = max(worst),
  `5%_quantile` = quantile(worst, probs = 0.05),
  `25%_quantile` = quantile(worst, probs = 0.25),
  `75%_quantile` = quantile(worst, probs = 0.75),
  `95%_quantile` = quantile(worst, probs = 0.95)
)
summary_worst <- knitr::kable(worst_stats, digits = 2,
                              caption = "Summary Table for Worst Ear")

summary_worst

In [None]:
# Case/control labelling on the SRT extremes, then volume-setting histograms.
# Lower SRT means better hearing (scores below -5.5 are labelled "normal"
# elsewhere in this notebook), so:
#   controls: left ear < -7.5 and right ear < -7.5
#   cases:    left ear > -5.5 and right ear > -5
# NOTE(review): the thresholds appear to be the per-ear 25% / 75% quantiles
# from the summary tables -- confirm, in particular -5.5 (left) vs -5 (right).
# Individuals matching neither rule get srt_cat = NA.

srt_best_worst <- srt_best_worst %>%
  # keep individuals with a volume setting recorded for at least one ear
  filter(!is.na(left_volume) | !is.na(right_volume)) %>%
  mutate(
    srt_cat = case_when(
      srt_score_left < -7.5 & srt_score_right < -7.5 ~ "control",
      srt_score_left > -5.5 & srt_score_right > -5 ~ "case"
    ),
    # volume labels -> proportion of maximum volume (still character here)
    left_volume = recode(left_volume, "100% (max)" = "1", "10%" = "0.1",
                         "20%" = "0.2", "40%" = "0.4", "70%" = "0.7"),
    right_volume = recode(right_volume, "100% (max)" = "1", "10%" = "0.1",
                          "20%" = "0.2", "40%" = "0.4", "70%" = "0.7")
  )

#25899 who are classified as either case or control
check_case_control <- srt_best_worst %>%
  filter(!is.na(srt_cat))

# Convert the volumes to numeric and average the two ears. The columns are
# referenced by name rather than by position (previously .[27:28]) so the mean
# stays correct if columns are added or reordered upstream. The mean is NA
# when either ear's volume is missing.
srt_best_worst <- srt_best_worst %>%
  mutate(left_volume = as.numeric(left_volume),
         right_volume = as.numeric(right_volume)) %>%
  mutate(mean = rowMeans(cbind(left_volume, right_volume)))

# Mean volume, cases and controls stacked in one histogram
volume_srt <- srt_best_worst %>%
  filter(!is.na(srt_cat)) %>%
  ggplot(aes(x = mean, fill = srt_cat)) +
  geom_histogram(binwidth = 0.1) +
  labs(
    title = "SRT vs Volume (Cases and Controls)",
    x = "Volume",
    y = "Count"
  ) +
  scale_y_continuous(
    breaks = c(2500, 5000, 7500, 10000, 12500),
    labels = c("2500", "5000", "7500", "10000", "12500"))

volume_srt

# Mean volume, cases only
volume_case <- srt_best_worst %>%
  filter(srt_cat == "case") %>%
  ggplot(aes(x = mean)) +
  geom_histogram(binwidth = 0.1) +
  labs(
    title = "SRT vs Volume (Cases)",
    x = "Volume",
    y = "Count"
  )

volume_case

# Mean volume, controls only
volume_control <- srt_best_worst %>%
  filter(srt_cat == "control") %>%
  ggplot(aes(x = mean)) +
  geom_histogram(binwidth = 0.1) +
  labs(
    title = "SRT vs Volume (Control)",
    x = "Volume",
    y = "Count"
  )

volume_control

# patchwork supplies the `/` operator that stacks the two plots vertically
library(patchwork)
volume_case / volume_control

volume_srt

## Plots

In [None]:
# Histograms of the left- and right-ear SRT scores, filled by hearing category.
# The category colours are set on the fill scale because the histograms map
# `fill`; a colour scale has no effect on them. Values are named so that each
# colour is tied to its category regardless of the alphabetical level order.
srt_cat_colours <- c(normal = "green", insufficient = "red", poor = "blue")

#histogram for SRT Score for Left Ear
left_plot <- srt_best_worst %>%
  filter(!is.na(srt_score_left)) %>%
  ggplot(aes(x = srt_score_left, fill = srt_left_cat)) +
  geom_histogram(binwidth = 0.25) +
  labs(
    title = "SRT Score for Left Ear (by category)",
    x = "SRT Score for Left ",
    y = "Count"
  ) +
  scale_x_continuous(
    breaks = c(-11.25, -5.5, -3.5, 0.5)) +
  scale_y_continuous(
    breaks = c(2500, 5000, 7500, 10000, 12500, 15000),
    labels = c("2500", "5000", "7500", "10000", "12500", "15000")) +
  # this `+` was missing, which left the manual scale detached from the plot
  scale_fill_manual("SRT Score Category",
                    breaks = names(srt_cat_colours),
                    values = srt_cat_colours)

#histogram for SRT Score for Right Ear
right_plot <- srt_best_worst %>%
  filter(!is.na(srt_score_right)) %>%
  ggplot(aes(x = srt_score_right, fill = srt_right_cat)) +
  geom_histogram(binwidth = 0.25) +
  labs(
    title = "SRT Score for Right Ear (by category)",
    x = "SRT Score for Right ",
    y = "Count"
  ) +
  scale_x_continuous(
    breaks = c(-11.25, -5.5, -3.5, 0.5)) +
  scale_y_continuous(
    breaks = c(2500, 5000, 7500, 10000, 12500, 15000),
    labels = c("2500", "5000", "7500", "10000", "12500", "15000")) +
  scale_fill_manual("SRT Score Category",
                    breaks = names(srt_cat_colours),
                    values = srt_cat_colours)

# patchwork supplies the `/` operator that stacks the two plots vertically
library(patchwork)
left_plot / right_plot


# Anderson-Darling normality tests (nortest::ad.test) on the right, left, best
# and worst ear scores; each call prints its own test result.
library(nortest)
ad.test(srt_best_worst$srt_score_right)
ad.test(srt_best_worst$srt_score_left)
ad.test(srt_best_worst$best)
ad.test(srt_best_worst$worst)

# Normal QQ plots of the standardised left- and right-ear SRT scores, first
# for all scored individuals and then coloured by hearing category. Scores are
# standardised with the mean and sd of the non-missing scores of that ear.

#QQ plot for SRT Score for Left Ear
left_standardised <- srt_best_worst %>%
  filter(!is.na(srt_score_left)) %>%
  mutate(left_z = (srt_score_left - mean(srt_score_left)) / sd(srt_score_left))

qq_left_total <- ggplot(left_standardised, aes(sample = left_z)) +
  stat_qq() +
  stat_qq_line() +
  labs(
    title = "QQ plot for SRT Score (Left)",
    y = "SRT Score Quantile",
    x = "Normal Quantile"
  ) +
  theme_classic()

qq_left_total

qq_left_by_cat <- ggplot(left_standardised,
                         aes(sample = left_z, color = srt_left_cat)) +
  stat_qq() +
  stat_qq_line() +
  labs(
    title = "QQ plot for SRT Score (Left)",
    y = "SRT Score Quantile",
    x = "Normal Quantile"
  ) +
  theme_classic()

qq_left_by_cat

#QQ plot for SRT Score for Right Ear
right_standardised <- srt_best_worst %>%
  filter(!is.na(srt_score_right)) %>%
  mutate(right_z = (srt_score_right - mean(srt_score_right)) / sd(srt_score_right))

qq_right_total <- ggplot(right_standardised, aes(sample = right_z)) +
  stat_qq(distribution = stats::qnorm) +
  stat_qq_line() +
  labs(
    title = "QQ plot for SRT Score (Right)",
    y = "SRT Score Quantile",
    x = "Normal Quantile"
  ) +
  theme_classic()

qq_right_total

qq_right_by_cat <- ggplot(right_standardised,
                          aes(sample = right_z, color = srt_right_cat)) +
  stat_qq(distribution = stats::qnorm) +
  stat_qq_line() +
  labs(
    title = "QQ plot for SRT Score (Right)",
    y = "SRT Score Quantile",
    x = "Normal Quantile"
  ) +
  theme_classic()

qq_right_by_cat
```

## Plots for Best and Worst Ear
```{r plots best_worst, warning=FALSE, collapse=FALSE, echo=T}
# Categorise the best and worst ear scores with the same cut-offs used for the
# individual ears ("normal" < -5.5, "insufficient" -5.5 to -3.5, "poor" > -3.5,
# "" when the score is missing), then plot histograms and normal QQ plots.
srt_best_worst_cat <- srt_best_worst %>%
  mutate(srt_best_cat = case_when(
    best < -5.5 ~ "normal",
    -5.5 <= best & best <= -3.5 ~ "insufficient",
    best > -3.5 ~ "poor",
    TRUE ~ ""
  )) %>%
  mutate(srt_worst_cat = case_when(
    worst < -5.5 ~ "normal",
    -5.5 <= worst & worst <= -3.5 ~ "insufficient",
    worst > -3.5 ~ "poor",
    TRUE ~ ""
  )) %>%
  filter(!is.na(best) | !is.na(worst))

# Category colours go on the fill scale because the histograms map `fill`; a
# colour scale has no effect on them. Values are named so that each colour is
# tied to its category regardless of the alphabetical level order.
srt_cat_colours <- c(normal = "green", insufficient = "red", poor = "blue")

best_plot <- srt_best_worst_cat %>%
  ggplot(aes(x = best, fill = srt_best_cat)) +
  geom_histogram(binwidth = 0.25) +
  labs(
    title = "SRT Score for the Best Ear (by category)",
    x = "SRT Score for the Best ",
    y = "Count"
  ) +
  scale_x_continuous(
    breaks = c(-11.25, -5.5, -3.5, 0.5)) +
  scale_y_continuous(
    breaks = c(2500, 5000, 7500, 10000, 12500, 15000),
    labels = c("2500", "5000", "7500", "10000", "12500", "15000")) +
  scale_fill_manual("SRT Score Category",
                    breaks = names(srt_cat_colours),
                    values = srt_cat_colours)

worst_plot <- srt_best_worst_cat %>%
  ggplot(aes(x = worst, fill = srt_worst_cat)) +
  geom_histogram(binwidth = 0.25) +
  labs(
    title = "SRT Score for the Worst Ear (by category)",
    # this axis was labelled "Best" (copied from best_plot)
    x = "SRT Score for the Worst ",
    y = "Count"
  ) +
  scale_x_continuous(
    breaks = c(-11.25, -5.5, -3.5, 0.5)) +
  scale_y_continuous(
    breaks = c(2500, 5000, 7500, 10000, 12500, 15000),
    labels = c("2500", "5000", "7500", "10000", "12500", "15000")) +
  scale_fill_manual("SRT Score Category",
                    breaks = names(srt_cat_colours),
                    values = srt_cat_colours)

# `/` (patchwork) stacks the two histograms vertically
best_plot / worst_plot

# QQ plots of the standardised scores. na.rm = TRUE keeps the mean and sd
# defined even if a score is missing; without it a single NA would turn every
# standardised value into NA and leave the plot empty.
qq_best <- srt_best_worst_cat %>%
  ggplot(aes(sample = (best - mean(best, na.rm = TRUE)) / sd(best, na.rm = TRUE))) +
  stat_qq(distribution = stats::qnorm) +
  stat_qq_line(col = 'blue') +
  labs(title = "QQ plot for the Best SRT Score",
       y = "SRT Score Quantiles",
       x = "normal Quantiles") +
  theme_classic()

qq_best

qq_worst <- srt_best_worst_cat %>%
  ggplot(aes(sample = (worst - mean(worst, na.rm = TRUE)) / sd(worst, na.rm = TRUE))) +
  stat_qq(distribution = stats::qnorm) +
  stat_qq_line(col = 'blue') +
  labs(title = "QQ plot for the Worst SRT Score",
       y = "SRT Score Quantiles",
       x = "normal quantiles") +
  theme_classic()

qq_worst