# Hearing related phenotypes

## Aim

Create a dataset of filtered individuals using the inclusion and exclusion criteria for diverse hearing related phenotyes to perform association analyses using the LMM.ipynb. 

## Location of files

In the shared folder is the original UKBB data
```
/SAY/dbgapstg/scratch/UKBiobank/phenotype_files/pleiotropy_R01/ukb42495_updatedJune2020
```

In my personal folder the filtered dataset

```
/home/dc2325/project/HI_UKBB
```

## Subset the data using variables of interest

Using the ukbconvert software and a list of pre-specified variables

```
./ukbconv ukb42495.enc_ukb r -i/home/dc2325/project/HI_UKBB/selectvars_062520.txt -o/home/dc2325/project/HI_UKBB/ukb42495_subset062520
./ukbconv ukb42495.enc_ukb r -i/home/dc2325/project/HI_UKBB/selectvars_080420.txt -o/home/dc2325/project/HI_UKBB/ukb42495_subset080420
```

In [None]:
[global]
# The working dir
parameter:cwd = path
# The fam file
parameter: famfile = path

## Subsetting individuals with genotypic data

### Load necessary libraries

In [None]:
library(plyr)
library(tidyverse)
library(pander)
library(ggpubr)
library(rapportools)

In [None]:
getwd()

In [2]:
setwd('~/project/HI_UKBB')

In [3]:
# Clean workspace
rm(list=ls())

In [None]:
# Run script to import data to R
source("ukb42495_subset080420.r")
nrow(bd)

In [None]:
# List of individuals with qc'ed genotypic files
df.geno <- read.table("/SAY/dbgapstg/scratch/UKBiobank/genotype_files/pleiotropy_geneticfiles/UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.fam", header= FALSE, stringsAsFactors = FALSE)
names(df.geno) <-c("FID","IID","ignore1", "ignore2", "ignore3", "ignore4")
nrow(df.geno)

In [None]:
head(bd[,1, drop=FALSE])

In [None]:
# Assign individual ID column to bd f.eid
names(bd)[1] <- "IID"
head(bd[,1, drop=FALSE])

In [None]:
# Merge the two data frames
df.gen.phen <-merge(df.geno, bd, by="IID", all=FALSE)
nrow(df.gen.phen)

In [None]:
# Step 5 Save as csv file
write.csv(df.gen.phen,'200804_UKBB_HI_genotypeqc.csv', row.names = FALSE)

## 1. Tinnitus phenotype (binary)

### a. Exclusion criteria based on ICD10, ICD9 codes and self-report
Apply the exclusion criteria defined by the group to remove unwanted individuals. This takes into account ICD10 codes, ICD9 codes and f.20002 (self-report). Please find a list of removed codes [here](https://docs.google.com/spreadsheets/d/12L7Cx4Ov8FppGVmG0DxL9uG-lVRHM5QJSea0nORyirQ/edit#gid=0). A total 12397 individuals were excluded in this step.

In [None]:
# To get a list of removed individuals. Make sure the list with the strings each line has \bstring\b so it can be recognized by -w
cd /home/dc2325/project/HI_UKBB

In [None]:
grep -w -f 200713_ICDcodes_exclusion.txt 200804_UKBB_HI_genotypeqc.csv > 200804_UKBB_excluded_individuals.csv
cat 200804_UKBB_excluded_individuals.csv | wc -l #12397 excluded

In [None]:
# To get the clean db with the included individuals
grep -wv -f 200713_ICDcodes_exclusion.txt 200804_UKBB_HI_genotypeqc.csv > 200804_UKBB_genotypeqc_HI_excr.csv
cat 200804_UKBB_genotypeqc_HI_excr.csv | wc -l #354347 retained
# To obtain the duplicate lines (if they exist)
#comm -12 <(sort 200713_UKBB_genotypeqc_tinnitus_excr.csv) <(sort 200713_UKBB_excluded_individuals.csv)

## Import clean data

In [5]:
df_clean <- read.csv(file = '200804_UKBB_genotypeqc_HI_excr.csv', header=TRUE)

**Analysis plan:**

1. Individuals who currently have tinnitus (all three yes categories) vs no never only controlling for sex, age, noisy workplace and loud music frequency (“crude”) (No tinnitus vs anytype of tinnitus). For this analysis the missing data of the noise variables was imputed using the median for cases and controls separately

3. Individuals in the two top YES categories vs NO never (‘No’ tinnitus vs ‘Yes, now all of the time’ and ‘Yes, now most of the time’)

4. Individuals that say yes in the top 3 categories vs No never (Remove category ‘yes, but not now, but have it in the past’)

## Data summary and recode

In [6]:
dim(df_clean)

### Recode genetic sex f.22001

In [20]:
table(df_clean$f.22001)


Female   Male 
192414 161933 

In [21]:
df_clean$sex <- revalue(df_clean$f.22001.0.0, c("Male" = '0', 'Female'='1' ))
table(df_clean$sex)


     1      0 
192414 161933 

In [22]:
dim(df_clean)

## Filtering out inconsistent cases
Recode 4 instances of field f.4803 with 0,1, and 9
* Yes, now most or all of the time : 1 
* Yes, now a lot of the time: 1 
* Yes, now some of the time: 1 
* Yes, but not now, but have in the past : 1 
* No, never: 0
* Do not know: 9 Prefer not to answer: NA

In [23]:
# Recode function:
recode<-function(df,column_name){
  new_names<-c()
  for (i in column_name){
    new_column_name<-paste0(i,"_recode")
    new_names<-c(new_names,new_column_name)
    df[,new_column_name] <- revalue(df[,i], c("No, never"= 0, 
                                            "Yes, but not now, but have in the past"= 1, 
                                            "Yes, now some of the time"= 1, 
                                            "Yes, now a lot of the time"= 1,
                                            "Yes, now most or all of the time"= 1,
                                            "Prefer not to answer"= NA,
                                            "Do not know"= 9 ))
  }
  return (list(df=df,new_column_names=new_names))
}

# columns needs to be recoded:
column_name<-c("f.4803.0.0","f.4803.1.0","f.4803.2.0","f.4803.3.0")

# get a new data.frame with recoded columns added:
df_clean<-recode(df=df_clean,column_name)$df

# get names of recoded columns:
new_column_names<-recode(df=df_clean,column_name)$new_column_names

# show recode summary:
for (i in new_column_names)
{cat(i,"summary:");print(table(df_clean[,i]));cat("\n")}

f.4803.0.0_recode summary:
    9     0     1 
 1989 80963 32642 

f.4803.1.0_recode summary:
    9     0     1 
  185 10561  4605 

f.4803.2.0_recode summary:
    9     0     1 
  410 23724 11589 

f.4803.3.0_recode summary:
   9    0    1 
  20 1664  726 



In [24]:
dim(df_clean)

## Get patterns for all possible answers

In [25]:
# Extract subset of data only with the recode columns of tinnitus
data_sub <- df_clean %>%
  select("IID",all_of(new_column_names)) 

# Function to extract all the available answers for 4 visits
# and put them in one string as "0000", "111", "991", etc

f2<-function(x){
  visit<-''
  for (i in 2:5){ # do not take the first column (i.e. IID)
    if (!is.na(x[i]))
    {visit<-paste0(visit,x[i])}
  }
  if(is.null(visit)){visit=NA}
  else{visit=visit}
  return (visit)
}

# Apply the above function and remove NAs
data_sub$visit<-apply(data_sub, 1, f2)

names(table(data_sub$visit))

In [26]:
all_pattern<-unique(data_sub$visit)

cat("There are",length(all_pattern),"different combinations of answers.\n")

There are 52 different combinations of answers.


In [27]:
cat("Those with 4 numbers has answer for all 4 visits. e.g. '0000': No, No, No, No.\n")

Those with 4 numbers has answer for all 4 visits. e.g. '0000': No, No, No, No.


In [28]:
cat("Those with 3 numbers has answer for 3 answers out of 4 visits. e.g. '001': No, No, Yes. \n")

Those with 3 numbers has answer for 3 answers out of 4 visits. e.g. '001': No, No, Yes. 


In [29]:
cat("Those with 2 numbers has answer for 2 answers out of 4 visits. e.g. '90': Do not know, No. \n")

Those with 2 numbers has answer for 2 answers out of 4 visits. e.g. '90': Do not know, No. 


In [30]:
cat("The first one, i.e. '' is for NAs.\n")

The first one, i.e. '' is for NAs.


In [31]:
table(data_sub$visit)


            0     00    000   0000   0001    001   0010   0011    009     01 
207944  87438  10425   1475     73      4    122      3      3     10   1717 
   010   0100    011   0111    019     09    090    091    099      1     10 
    78      3    131      6      2     94     10      5      2  36961   1014 
   100   1000    101   1010   1011     11    110   1100   1101    111   1110 
    97      6     42      2      2   3717     59      3      5    430      4 
  1111    119     19    190    191   1919    199      9     90    900    901 
    19      4     76      4      8      1      1   2110     92     10      5 
  9011    909     91    911     99    990    991    999 
     1      3     85      9     28      1      2      1 

## Extract consistent cases

In [89]:
all_no<-all_pattern[which(grepl("0",all_pattern) & !grepl("9",all_pattern) & !grepl("1",all_pattern))]
all_yes<-all_pattern[which(!grepl("0",all_pattern) & !grepl("9",all_pattern) & grepl("1",all_pattern))]
all_do_not_know<-all_pattern[which(!grepl("0",all_pattern) & grepl("9",all_pattern) & !grepl("1",all_pattern))]
table(data_sub[data_sub$visit%in%union(union(all_no,all_yes),all_do_not_know),"visit"])


    0    00   000  0000     1    11   111  1111     9    99   999 
87438 10425  1475    73 36961  3717   430    19  2110    28     1 

In [90]:
consistent_1<-c(all_no,all_yes,all_do_not_know)

# might be inconsistent cases:
do_not_know_no<-all_pattern[which(grepl("0",all_pattern) & grepl("9",all_pattern) & !grepl("1",all_pattern))]
table(data_sub[data_sub$visit%in%do_not_know_no,"visit"])


009  09 090 099  90 900 909 990 
 10  94  10   2  92  10   3   1 

In [91]:
do_not_know_yes<-all_pattern[which(!grepl("0",all_pattern) & grepl("9",all_pattern) & grepl("1",all_pattern))]
table(data_sub[data_sub$visit%in%do_not_know_yes,"visit"])


 119   19  191 1919  199   91  911  991 
   4   76    8    1    1   85    9    2 

In [92]:
yes_no<-all_pattern[which(grepl("0",all_pattern) & !grepl("9",all_pattern) & grepl("1",all_pattern))]
table(data_sub[data_sub$visit%in%yes_no,"visit"])


0001  001 0010 0011   01  010 0100  011 0111   10  100 1000  101 1010 1011  110 
   4  122    3    3 1717   78    3  131    6 1014   97    6   42    2    2   59 
1100 1101 1110 
   3    5    4 

In [93]:
with_all_three<-all_pattern[which(grepl("0",all_pattern) & grepl("9",all_pattern) & grepl("1",all_pattern))]
table(data_sub[data_sub$visit%in%with_all_three,"visit"])


 019  091  190  901 9011 
   2    5    4    5    1 

In [94]:
might_inconsistent<-c(do_not_know_no,do_not_know_yes,yes_no,with_all_three)

## Remove inconsistent cases and NAs

In [95]:
# get all the inconsistent cases:
exceptions<-c("91","911","991","0001","001","0011","01","011","0111", "091", "901", "9011")
inconsistent<-setdiff(might_inconsistent,exceptions)

cat("There are",length(which(is.empty(data_sub$visit))),"NAs, removed")

There are 207944 NAs, removed

In [96]:
cat("There are",length(which(data_sub$visit%in%inconsistent)),"inconsistent cases, removed")

There are 1636 inconsistent cases, removed

In [97]:
# removed those NAs and inconsistent cases: 
IID_cleaned<-data_sub %>%
  filter(!is.empty(visit)) %>%
  filter((!visit%in%inconsistent)) %>%
  select(IID)

dim(IID_cleaned)

In [98]:
IID_cleaned<-IID_cleaned[,1]
cat("After removing all the NAs and inconsistent cases, there are",length(IID_cleaned),"cases left")

After removing all the NAs and inconsistent cases, there are 144767 cases left

In [99]:
# get the dataset after cleansing: 
data_cleaned<-df_clean %>%
  filter(IID%in%IID_cleaned)

dim(data_cleaned)

## Fitlering tinnitus for different analyses

Recode 4 instances of field f.4803 with 4,3,2,1,0, and 9

* Yes, now most or all of the time : 4 
* Yes, now a lot of the time: 3 
* Yes, now some of the time: 2 
* Yes, but not now, but have in the past : 1 
* No, never: 0
* Do not know: 9 
* Prefer not to answer: NA

## Filtering for analysis plan 1

In [100]:
# define cases and control for specific analysis:
all_answers<-c("0","1","2","3","4","9")
control<-c('0')
cases<-c("1","2","3","4")

f3<-function(x){
  get_rid_of<-setdiff(all_answers,union(cases,control))

  for (i in get_rid_of){
  if (grepl(i,x[which(colnames(data_sub)=="visit")])){
    removed=TRUE
    break
  }
  else{removed=FALSE}
  }
  return (removed)
}
data_p1<-data_sub
data_p1$removed<-apply(data_p1, 1, f3)

# filter 
data_p1<-data_p1 %>%
  filter(removed==FALSE)
head(data_p1,40)

Unnamed: 0_level_0,IID,f.4803.0.0_recode,f.4803.1.0_recode,f.4803.2.0_recode,f.4803.3.0_recode,visit,removed
Unnamed: 0_level_1,<int>,<fct>,<fct>,<fct>,<fct>,<chr>,<lgl>
1,1000019,,,,,,False
2,1000022,,,,,,False
3,1000035,,,,,,False
4,1000046,,,0.0,,0.0,False
5,1000054,,,,,,False
6,1000063,0.0,,,,0.0,False
7,1000078,,0.0,0.0,,0.0,False
8,1000105,1.0,,,,1.0,False
9,1000112,,,0.0,,0.0,False
10,1000129,,,,,,False


In [101]:
# change all "1", "2" "3" and "4" into "1"
data_p1$visit<-gsub("1|2|3|4","1",data_p1$visit) 
table(data_p1$visit)


            0     00    000   0000   0001    001   0010   0011     01    010 
207944  87438  10425   1475     73      4    122      3      3   1717     78 
  0100    011   0111      1     10    100   1000    101   1010   1011     11 
     3    131      6  36961   1014     97      6     42      2      2   3717 
   110   1100   1101    111   1110   1111 
    59      3      5    430      4     19 

In [79]:
names(table(data_p1$visit))

In [80]:
# get the dataset for plan 1:
IID_plan1<-data_p1$IID
data_p1<-data_cleaned[which(data_cleaned$IID%in%IID_plan1),]
dim(data_p1)

In [104]:
head(data_p1[,new_column_name],40)

ERROR: Error in `[.data.frame`(data_p1, , new_column_name): object 'new_column_name' not found


In [None]:
write.csv(data_plan1, 'UKBB_HI_genotypeqc_data_plan1.csv', row.names = FALSE)

In [82]:
table(data_p1$f.4803.3.0_recode)


   9    0    1 
   0 1451  684 

### Binary variable recoding

In [None]:
## Variable recoding
binary.rc <- c("No"="No",
               "Yes"="Yes",
               "Prefer not to answer"= NA,
               "Do not know"= NA)

In [None]:
table(data_plan1$f.4803.0.0_recode)

Recode tinnitus to a binary variable f.4803

In [None]:
data_plan1$f.4803.0.0_recode <- revalue(data_plan1$f.4803.0.0_recode, binary.rc)
data_plan1$f.4803.1.0_recode <- revalue(data_plan1$f.4803.1.0_recode, binary.rc)
data_plan1$f.4803.2.0_recode <- revalue(data_plan1$f.4803.2.0_recode, binary.rc)
data_plan1$f.4803.3.0_recode <- revalue(data_plan1$f.4803.3.0_recode, binary.rc)
table(data_plan1$f.4803.0.0_recode)

### c. Filtering of the tinnitus phenotype

Define cases with tinnitus where all yes categories are considered cases

In [102]:
data_p1$cases <- with(data_p1, ifelse(f.4803.0.0_recode == 0 & (f.4803.1.0_recode == 1 | f.4803.2.0_recode == 1 | f.4803.3.0_recode == 1)   
                                   | f.4803.0.0_recode %in% c(1,NA) & (f.4803.1.0_recode %in% c(1,NA) | f.4803.2.0_recode %in% c(1,NA) | f.4803.3.0_recode %in% c(1,NA))
                                  & !(f.4803.0.0_recode %in% c(NA) & f.4803.1.0_recode %in% c(NA) & f.4803.2.0_recode %in% c(NA) & f.4803.3.0_recode %in% c(NA))
                                      ,1, NA))

Number of cases

In [103]:
table(data_p1$cases)


    1 
64367 

Define controls (no tinnitus)

In [87]:
data_p1$controls <- with(data_p1, ifelse(f.4803.0.0_recode %in% c(0,NA) & f.4803.1.0_recode %in% c(0, NA) & f.4803.2.0_recode %in% c(0,NA) &f.4803.3.0_recode %in% c(0,NA)
                                         & !(f.4803.0.0_recode %in% c(NA) & f.4803.1.0_recode %in% c(NA) & f.4803.2.0_recode %in% c(NA) & f.4803.3.0_recode %in% c(NA)) ,0, NA))

Number of controls

In [88]:
table(data_p1$controls)


    0 
99411 

Create a column with the binary status for tinnitus of the individuals

In [None]:
data_plan1$tinnitus <- coalesce(data_plan1$cases, data_plan1$controls)
table(data_plan1$tinnitus)

In [None]:
# Get the number of NAs
length(which(is.na(data_plan1$tinnitus)))

### d. Obtaining the age for tinnitus cases and controls

Get the "age at onset" of tinnitus using f.21003 Age when attended assessment centre for each of the instances
For cases first time they replied yes to f.4803

In [None]:
# Get the subset of data to extract age
age_all = df %>% 
  filter(!is.na(tinnitus)) %>%
  select(IID,tinnitus, f.4803.0.0.rc, f.4803.1.0.rc, f.4803.2.0.rc, f.4803.3.0.rc, f.21003.0.0, f.21003.1.0, f.21003.2.0, f.21003.3.0)  # data_filed 210003: Age when attended assessment centre
head(age_all)

In [None]:
# Get the subset data of cases
age_cases = age_all %>% 
  filter(tinnitus=="Yes")  %>%
  select(IID,f.4803.0.0.rc,f.4803.1.0.rc,f.4803.2.0.rc,f.4803.3.0.rc,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0)
head(age_cases,12)

In [None]:
# Get the # of column where first replied Yes:
age_cases$visit_idx = apply(age_cases, 1, function(x) unlist(which(x == 'Yes')))
head(age_cases)

In [None]:
# Define offset:
# offset: refers to the # of columns between the first age column (i.e.f.21003.0.0) and the first recode column (i.e.f.4803.0.0.rc)
offset = which(colnames(age_cases) == 'f.21003.0.0') - which(colnames(age_cases) == 'f.4803.0.0.rc')

# Define the function to extract the first time they said yes for cases 
f=get_age_func <- function(x) {
  visit_index=x[which(colnames(age_cases)=="visit_idx")]
  index=min(unlist(visit_index))+offset
  age=x[index]
  final_age=unlist(age)
  if(is.null(final_age))
  {final_age<-NA}
  return(final_age)
}

# Get the final age for cases
age_cases$age_final = apply(age_cases, 1, f)

# Show first 6 rows
head(age_cases)
summary(age_cases$age_final)

Get the age for controls last time they replied no to f.4803

In [None]:
# Get the subset data of controls
age_control = age_all %>% 
  filter(tinnitus=="No")  %>%
  select(IID,f.4803.0.0.rc,f.4803.1.0.rc,f.4803.2.0.rc,f.4803.3.0.rc,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0)
head(age_control,12)

In [None]:
# Get the # of column where last replied No:
age_control$visit_idx = apply(age_control, 1, function(x) unlist(which(x == 'No')))

# Define offset:
# offset: refers to the # of columns between the first age column (i.e.f.21003.0.0) and the first recode column (i.e.f.4803.0.0.rc)
offset = which(colnames(age_control) == 'f.21003.0.0') - which(colnames(age_control) == 'f.4803.0.0.rc')

# Define the function to extract the last time they said no for control

f=get_age_func <- function(x) {
  visit_index=x[which(colnames(age_control)=="visit_idx")]
  index=max(unlist(visit_index))+offset
  age=x[index]
  age=unlist(age)
  return(age)
}

# Get the final age for controls
age_control$age_final = apply(age_control, 1, f)

# Show first 6 rows
head(age_control)
summary(age_control$age_final)

In [None]:
# Merge age_cases and age_controls
age_tinnitus <- rbind(age_cases, age_control) 
dim(age_tinnitus)
head(age_tinnitus)

In [None]:
#Merge with complete database keep the all the rows from original db
df.age = merge(x = df,y = age_tinnitus[,c("IID","age_final")],by="IID", all.x=TRUE)
dim(df.age)

### Recode noisy workplace f.4825 and loud music frequency exposure f.4836

In [None]:
# Recode function:
recode_ordinal<-function(df,column_name){
  new_names<-c()
  for (i in column_name){
    new_column_name<-paste0(i,"_recode")
    new_names<-c(new_names,new_column_name)
    df[,new_column_name] <- revalue(df[,i], c("No"= "0", 
                                            "Yes, for less than a year"= "1", 
                                            "Yes, for around 1-5 years"= "2", 
                                            "Yes, for more than 5 years"= "3",
                                            "Prefer not to answer"= NA,
                                            "Do not know"= NA ))
    df[,new_column_name] <- ordered(df[,new_column_name] , c("0", "1", "2", "3"))
  }
  return (list(df=df,new_column_names=new_names))
}
# columns needs to be recoded:
column_name<-c("f.4825.0.0", "f.4825.1.0", "f.4825.2.0", "f.4825.3.0","f.4836.0.0", "f.4836.1.0", "f.4836.2.0", "f.4836.3.0")
# get a new data.frame with recoded columns added:
data_cleaned<-recode_ordinal(df=data_cleaned,column_name)$df
# get names of recoded columns:
new_column_names<-recode_ordinal(df=data_cleaned,column_name)$new_column_names
# show recode summary:
for (i in new_column_names)
{cat(i,"summary:");print(table(data_cleaned[,i]));cat("\n")}

### Checking consistency of the f.4825 noisy workplace and filtering

In [None]:
# Extract subset of data only with the recode columns of noisy workplace variable
data_noise <- data_cleaned %>%
  select(IID, "f.4825.0.0", "f.4825.1.0", "f.4825.2.0", "f.4825.3.0") >%>
    
dim(data_noise)

In [None]:
# Function to extract all the available answers for 4 visits
# and put them in one list
f<-function(x){
  visit<-c()
  for (i in 2:5){
    if (!is.na(x[i]))
    {visit<-c(visit,x[i])}
  }
  if(is.null(visit)){visit=NA}
  else{visit=as.numeric(visit)}
  return (visit)
}

# Apply the above function and remove NAs
data_noise$visit<-apply(data_noise, 1, f)

In [None]:
data_noise<-data_noise %>%
  filter(!is.na(visit)) 
head(data_noise)
dim(data_noise)

In [None]:
# Function to get the final code for noise_wp
f<-function(x){
  l=length(x$visit)
  if (l==1){ # only one answer available
    result=x$visit
  }
  else{ # more then one answer available
    result=x$visit[1]
    for (i in 2:l){
      if (x$visit[i] >= x$visit[i-1]){result=x$visit[i]} # consistent ones
      else {result=NA; break} # inconsistent ones
    }
  }
  return(result)
}

# Apply the above function and remove NAs
data_noise$noise_wp<-apply(data_noise, 1, f)
data_noise<-data_noise %>%
  filter(!is.na(noise_wp)) 
head(data_noise, 12) # note: noise_wp code generated here is numeric, not factor

In [None]:
# Append the noise variable to the data
df.noise = merge(x = df.age,y = data_noise[,c("IID", "noise_wp")],by="IID", all.x=TRUE)
dim(df.noise)

### Checking consistency of the f.4836 loud music exposure frequency and filtering

In [None]:
# Extract subset of data only with the recode columns of loud music exposure variable f.4836
data_music <-  df.age %>%
  select(IID,f.4836.0.0.rc,f.4836.1.0.rc,f.4836.2.0.rc,f.4836.3.0.rc) 
head(data_music)
dim(data_music)

In [None]:
# Function to extract all the available answers for 4 visits
# and put them in one list

f<-function(x){
  visit<-c()
  for (i in 2:5){
    if (!is.na(x[i]))
    {visit<-c(visit,x[i])}
  }
  if(is.null(visit)){visit=NA}
  else{visit=as.numeric(visit)}
  return (visit)
}

# Apply the above function and remove NAs
                              
data_music$visit<-apply(data_music, 1, f)
data_music<-data_music %>%
  filter(!is.na(visit)) 
head(data_music)
dim(data_music)

In [None]:
# Function to get the final code for "loud_music"
f<-function(x){
  l=length(x$visit)
  if (l==1){ # only one answer available
    result=x$visit
  }
  else{ # more then one answer available
    result=x$visit[1]
    for (i in 2:l){
      if (x$visit[i] >= x$visit[i-1]){result=x$visit[i]} # consistent ones
      else {result=NA; break} # inconsistent ones
    }
  }
  return(result)
}

# Apply the above function and remove NAs
data_music$loud_music<-apply(data_music, 1, f)
data_music<-data_music %>%
  filter(!is.na(loud_music)) 
head(data_music, 12) # note: loud_music code generated here is numeric, not factor
dim(data_music)

In [None]:
# Merge all of the variables in the final dataset
df.final = merge(x = df.noise,y = data_music[,c("IID", "loud_music")],by="IID", all.x=TRUE)
dim(df.final)  

In [None]:
# Recode function:
recode<-function(df,column_name){
  new_names<-c()
  for (i in column_name){
    new_column_name<-paste0(i,"_recode")
    new_names<-c(new_names,new_column_name)
    df[,new_column_name] <- revalue(df[,i], c("No, never"= 0, 
                                            "Yes, but not now, but have in the past"= 1, 
                                            "Yes, now some of the time"= 2, 
                                            "Yes, now a lot of the time"= 3,
                                            "Yes, now most or all of the time"= 4,
                                            "Prefer not to answer"= NA,
                                            "Do not know"= 9 ))
  }
  return (list(df=df,new_column_names=new_names))
}

# columns needs to be recoded:
column_name<-c("f.4803.0.0","f.4803.1.0","f.4803.2.0","f.4803.3.0")

# get a new data.frame with recoded columns added:
df_recode<-recode(df=data_cleaned,column_name)$df

# get names of recoded columns:
new_column_names<-recode(df=data_cleaned,column_name)$new_column_names

# show recode summary:
for (i in new_column_names)
{cat(i,"summary:");print(table(df_recode[,i]));cat("\n")}

## Get patterns for all visits

In [None]:
# Extract subset of data only with the recode columns of tinnitus
data_sub <- df_recode %>%
  select("IID",all_of(new_column_names)) 

# Function to extract all the available answers for 4 visits
# and put them in one string as "0000", "111", "991", etc

f2<-function(x){
  visit<-''
  for (i in 2:5){ # do not take the first column (i.e. IID)
    if (!is.na(x[i]))
    {visit<-paste0(visit,x[i])}
  }
  if(is.null(visit)){visit=NA}
  else{visit=visit}
  return (visit)
}

# Apply the above function and remove NAs
data_sub$visit<-apply(data_sub, 1, f2)

names(table(data_sub$visit))

In [None]:
all_pattern <- unique(data_sub$visit)

cat("There are",length(all_pattern),"different combinations of answers.\n")

In [None]:
table(data_sub$visit)

## Filtering for analysis plan 2

In [None]:
# define cases and control for specific analysis:
all_answers<-c("0","1","2","3","4","9")
control<-c('0')
cases<-c("3","4")

f3<-function(x){
  get_rid_of<-setdiff(all_answers,union(cases,control))

  for (i in get_rid_of){
  if (grepl(i,x[which(colnames(data_sub)=="visit")])){
    removed=TRUE
    break
  }
  else{removed=FALSE}
  }
  return (removed)
}
data_sub2<-data_sub
data_sub2$removed<-apply(data_sub2, 1, f3)

# filter 
data_sub2<-data_sub2 %>%
  filter(removed==FALSE)
head(data_sub2,40)

In [None]:
# change all "3" into "4"
data_sub2$visit<-gsub("3","4",data_sub2$visit) 
table(data_sub2$visit)

In [None]:
names(table(data_sub2$visit))

In [None]:
# get the dataset for plan 2:
IID_plan2<-data_sub2$IID
data_plan2<-data.geno.pheno[which(data.geno.pheno$IID%in%IID_plan2),]
dim(data_plan2)

In [None]:
head(data_plan2[,column_name],40)

In [None]:
write.csv(data_plan2, 'UKBB_HI_genotypeqc_data_plan2.csv', row.names = FALSE)

## Filtering for analysis plan 3

In [None]:
# define cases and control for specific analysis:
all_answers<-c("0","1","2","3","4","9")
control<-c('0')
cases<-c("2","3","4")

data_sub3<-data_sub
data_sub3$removed<-apply(data_sub3, 1, f3)

# filter 
data_sub3<-data_sub3 %>%
  filter(removed==FALSE)
head(data_sub3,40)

In [None]:
# change all "2" and "3" into "4"
data_sub3$visit<-gsub("2","4",data_sub3$visit) 
data_sub3$visit<-gsub("3","4",data_sub3$visit) 
table(data_sub3$visit)

In [None]:
names(table(data_sub3$visit))

In [None]:
IID_plan3<-data_sub3$IID
data_plan3<-df_clean[which(df_clean$IID%in%IID_plan3),]
dim(data_plan3)

In [None]:
head(data_plan3[,column_name],40)

In [None]:
write.csv(data_plan3, 'UKBB_HI_genotypeqc_data_plan3.csv', row.names = FALSE)

### g. Exporting the final phenotype file Tinnitus with complete cases for association analyses

In [None]:
# Last renaming and recoding
df.final$tinnitus <- revalue(df.final$tinnitus, c("No" = '0', 'Yes'='1' ))
names(df.final)[names(df.final) == "age_final"] <- "age"

# Creating the file for subsequent association analyses

tinnitus_cc <- df.final %>%
  filter(!is.na(tinnitus)) %>%
  select(FID, IID, age, sex, tinnitus, noise_wp, loud_music)
head(tinnitus_cc)
dim(tinnitus_cc)

In [None]:
length(which(is.na(tinnitus_cc$noise_wp)))

In [None]:
length(which(is.na(tinnitus_cc$loud_music)))

In [None]:
length(which(is.na(tinnitus_cc$loud_music) & is.na(tinnitus_cc$noise_wp)))

In [None]:
length(which(is.na(tinnitus_cc$loud_music) | is.na(tinnitus_cc$noise_wp))) #noise variables missing cases

In [None]:
# Export to file in correct format 
# Contains all tinnitus cases, noise variables some are NA
write.table(tinnitus_cc, '200716_UKBB_Tinnitus_f4803', quote = FALSE, row.names = FALSE)

In [None]:
tinnitus_complete <- df.final %>%
  filter(!is.na(tinnitus) & !is.na(noise_wp) & !is.na(loud_music)) %>%
  select(FID, IID, age, sex, tinnitus, noise_wp, loud_music)
head(tinnitus_complete)
dim(tinnitus_complete)

In [None]:
# Export file tinnitus complete cases (no missing data for noise variables)
write.table(tinnitus_complete,'200722_UKBB_Tinnitus_f4803_completecases', quote = FALSE, row.names = FALSE)

In [None]:
# Export complete database to extract the rest of phenotypes
write.csv(df.final,'200716_UKBB_HI_genotypeqc.csv', row.names = FALSE)

In [None]:
# Import tinnitus complete
tinnitus_complete <- read.table('200722_UKBB_Tinnitus_f4803_completecases',  header=TRUE)
head(tinnitus_complete)

### h. Imputation of noise variables using average

Calculate mean and median for the entire sample (n=142,082)

In [None]:
mean(tinnitus_cc$noise_wp, na.rm=TRUE)
median(tinnitus_cc$noise_wp, na.rm=TRUE)

Calculate the mean and median in cases and controls

In [None]:
mean(subset(tinnitus_cc, tinnitus == 1)$noise_wp, na.rm=TRUE)
median(subset(tinnitus_cc, tinnitus == 1)$noise_wp, na.rm=TRUE)

In [None]:
mean(subset(tinnitus_cc, tinnitus == 0)$noise_wp, na.rm=TRUE)
median(subset(tinnitus_cc, tinnitus == 0)$noise_wp, na.rm=TRUE)

Impute values of noisy workplace using the median

In [None]:
noise.imp <-df.final %>% 
  filter(!is.na(tinnitus)) %>%
  select(IID, tinnitus, noise_wp) %>%
  group_by(tinnitus) %>%
  mutate(noise_wp=ifelse(is.na(noise_wp),median(noise_wp,na.rm=TRUE),noise_wp))
noise.imp.fin <- rename(noise.imp, noise_imp = noise_wp)
head(noise.imp.fin, 10)

Impute values for loud music exposure using then median

In [None]:
music.imp <-df.final %>% 
  filter(!is.na(tinnitus)) %>%
  select(IID, tinnitus, loud_music) %>%
  group_by(tinnitus) %>%
  mutate(loud_music=ifelse(is.na(loud_music),median(loud_music,na.rm=TRUE),loud_music))
music.imp.fin <- rename(music.imp, music_imp = loud_music )
head(music.imp.fin,10)

Merge with complete database

In [None]:
df.noise.imp <- merge(x = df.final,y = noise.imp.fin[,c("IID","noise_imp")],by="IID", all.x=TRUE)

In [None]:
df.final.imp <- merge(x = df.noise.imp,y = music.imp.fin[,c("IID", "music_imp")],by="IID", all.x=TRUE)
dim(df.final.imp)

In [None]:
tinnitus.imp <- df.final.imp %>%
  filter(!is.na(tinnitus)) %>%
  select(FID, IID, age, sex, tinnitus, noise_imp, music_imp)
head(tinnitus.imp)
dim(tinnitus.imp)

In [None]:
write.table(tinnitus.imp, '200720_UKBB_Tinnitus_f4803_noise_imp', quote = FALSE, row.names = FALSE)
length(which(is.na(tinnitus.imp$noise_imp)))
length(which(is.na(tinnitus.imp$music_imp)))

In [None]:
write.csv(df.final.imp, '200720_UKBB_HI_genotypeqc_noise_imp.csv', row.names = FALSE)

### i. Descriptive statistics

Are there differences in the sex proportions of tinnitus cases?

In [None]:
female.total = length(which(tinnitus_complete$sex=="1"))
male.total = length(which(tinnitus_complete$sex=="0"))
people.total= c(male.total, female.total)
people.total # Number of male and female in the sample

In [None]:
female.cases = length(which(tinnitus_complete$tinnitus == "1" & tinnitus_complete$sex == "1"))
male.cases =length(which(tinnitus_complete$tinnitus == "1" & tinnitus_complete$sex == "0")) 
people.cases = c(male.cases, female.cases)
people.cases # Number of male and female cases
prop.test(people.cases, people.total) # Proportion of male and female cases in the total sample are different

In [None]:
female.controls = length(which(tinnitus_complete$tinnitus == "0" & tinnitus_complete$sex == "1"))
male.controls = length(which(tinnitus_complete$tinnitus == "0" & tinnitus_complete$sex == "0"))
people.controls = c(male.controls, female.controls)
people.controls # Number of male and female controls
prop.test(people.controls, people.total) # Proportion of male and female controls differ

Are there differences in the sex proportions and noise variables?

In [None]:
sex.noise=table(tinnitus_complete$noise_wp, tinnitus_complete$sex)
sex.noise
chisq.test(sex.noise) # Noise at workplace is not independent of sex

In [None]:
sex.music = table(tinnitus_complete$loud_music, tinnitus_complete$sex)
sex.music
chisq.test(sex.music) # Loud music frequency exposure is not independent of sex

Barplot sex vs noisy workplace

In [None]:
noise.plot <- ggplot(tinnitus_complete, aes(noise_wp)) +
  geom_bar(aes(y=..count../sum(..count..), fill = sex), position = "dodge") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Barplot of sex by noisy workplace",
       x = "Noisy workplace", 
       y = "Proportion of individuals")
noise.plot 

Barplot sex vs loud music exposure

In [None]:
music.plot <- ggplot(tinnitus_complete, aes(loud_music)) +
  geom_bar(aes(y=..count../sum(..count..), fill = sex), position = "dodge") +
  scale_y_continuous(labels = scales::percent) +
labs(title = "Barplot of sex by loud music exposure",
       x = "Loud music exposure", 
       y = "Proportion of individuals")
music.plot

In [None]:
label_sex <- c("1" = "Female", "0" = "Male")
label_tin <- c("1" = "Cases", "0" = "Controls")
sex.tin.noise <- ggplot(tinnitus_complete, aes(x = noise_wp)) +
  geom_bar(aes(y = (..count..)/sum(..count..))) +
  labs(title = "Tinnitus, sex and noisy workplace",
       x = "Noisy workplace", 
       y = "Proportion of individuals") +
  scale_y_continuous(labels = scales::percent, name = "Proportion") +
  facet_grid(tinnitus ~ sex, labeller=labeller(sex = label_sex, tinnitus = label_tin)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
sex.tin.noise

In [None]:
sex.tin.music <- ggplot(tinnitus_complete, aes(x = loud_music)) +
  geom_bar(aes(y = (..count..)/sum(..count..))) +
    labs(title = "Tinnitus, sex and loud music frequency exposure",
       x = "Loud music frequency exposure", 
       y = "Proportion of individuals") +
  scale_y_continuous(labels = scales::percent, name = "Proportion") +
  facet_grid(tinnitus ~ sex, labeller=labeller(sex = label_sex, tinnitus = label_tin)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
sex.tin.music

In [None]:
tinnitus_complete %>%
select(age, noise_wp) %>%
group_by(noise_wp) %>%
  summarise(
    count = n(),
    mean = mean(age, na.rm = TRUE),
    sd = sd(age, na.rm = TRUE))

In [None]:
noise.boxp <- ggboxplot(tinnitus_complete, x = "noise_wp", y = "age", 
          color = "noise_wp",
          order = c("0", "1", "2", "3"),
          ylab = "Age", xlab = "noisy workplace")
noise.boxp

In [None]:
res1.aov <- aov(age ~ noise_wp, data = tinnitus_complete)
summary(res1.aov)
TukeyHSD(res1.aov)

In [None]:
music.boxp <- ggboxplot(tinnitus_complete, x = "loud_music", y = "age", 
          color = "loud_music",
          order = c("0", "1", "2", "3"),
          ylab = "Age", xlab = "Loud music frequency")
music.boxp

In [None]:
res2.aov <- aov(age ~ loud_music, data = tinnitus_complete)
summary(res2.aov)
TukeyHSD(res2.aov)

Correlation between the two noise variables for cases and controls separately using Kendall correlation

In [None]:
cases <- tinnitus_complete %>%
  filter(tinnitus=="1")
noise_wp <- as.numeric(factor(cases$noise_wp,levels=c("0", "1", "2", "3"))) 
loud_music <- as.numeric(factor(cases$loud_music, levels=c("0", "1", "2", "3"))) 
m <- cbind(noise_wp, loud_music)
cor(m, method="kendall", use="pairwise")
cor.test(noise_wp, loud_music, method="kendall")

In [None]:
controls <- tinnitus_complete %>%
  filter(tinnitus=="0")
noise_wp.ctr <- as.numeric(factor(controls$noise_wp,levels=c("0", "1", "2", "3"))) 
loud_music.ctr <- as.numeric(factor(controls$loud_music, levels=c("0", "1", "2", "3"))) 
m1 <- cbind(noise_wp.ctr, loud_music.ctr)
cor(m1, method="kendall", use="pairwise")
cor.test(noise_wp.ctr, loud_music.ctr, method="kendall")

Heat map for variable correlation

In [None]:
#Heat map
noisevar.df <- as.data.frame(table(tinnitus_complete$noise_wp, tinnitus_complete$loud_music))
ggplot(noisevar.df, aes(Var1, Var2)) +
  geom_tile(aes(fill = Freq), colour = "black") +
  scale_fill_gradient(low = "white", high = "steelblue")

Fluctuation plot

In [None]:
# Fluctuation plot
theme_nogrid <- function (base_size = 12, base_family = "") {
  theme_bw(base_size = base_size, base_family = base_family) %+replace% 
    theme(panel.grid = element_blank())   
}
ggplot(noisevar.df, aes(Var1, Var2)) +
  geom_point(aes(size = Freq, color = Freq, stat = "identity", position = "identity"), shape = 15) +
  scale_size_continuous(range = c(3,15)) + 
  scale_color_gradient(low = "white", high = "black") +
  theme_nogrid()

## 2. SRT phenotype (quantitative)

The phenotypes to be used are as follow:
1. Left ear f.20019
2. Right ear f.20021
3. Best ear (create a new variable extracting the min SRT value among f.20019 and f.20021)
4. Worst ear (create a new variable extracting the max SRT value among f.20019 and f.20021)

Age is calculated as follow:

- For people with repeated measures take age at last visit and measurement at last visit
- For people with only one measure take age at that visit

Sex corresponds to f.22001 (genetic sex):

- Male = 0
- Female = 1

Noise variable and loud music exposure frequency: same as for Tinnitus
                
1. Remove inconsistent individuals 
    - said 1,2 or 3 and in following visits said 0
    - said a higher exposure (e.g 3) and then a lower one (e.g 1 or 2) in following visits
2. Retain consistent individuals and use highest reported exposure

### a. Filter SRT codes
Obtain scores for right and left ear

Obtain the age at the last visit

In [None]:
df.final.imp = read.csv('/SAY/dbgapstg/scratch/UKBiobank/phenotype_files/hearing_impairment/200720_UKBB_HI_genotypeqc_noise_imp.csv', header=TRUE)

In [None]:
setwd('/home/dc2325/project/HI_UKBB/SRT')

In [None]:
srt_all = df.final.imp %>% 
  select(IID,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0, f.20019.0.0, f.20019.1.0, f.20019.2.0, f.20019.3.0,f.20021.0.0,f.20021.1.0,f.20021.2.0,f.20021.3.0)

srt_clean <-  srt_all %>% 
  mutate(srt_score_left = apply(select(., starts_with("f.20019")), 1, function(x)  {tail(x[!is.na(x)],n=1)[1]}),
         srt_score_right = apply(select(., starts_with("f.20021")), 1, function(x) {tail(x[!is.na(x)],n=1)[1]}),
         srt_age =  apply(select(., starts_with("f.21003")), 1, function(x)  {tail(x[!is.na(x)],n=1)[1]})
  )

head(srt_clean)

### e. Descriptive statistics

In [None]:
#1555 who do not have srt score for left ear
left_ear_na <- srt_clean %>% 
  filter(is.na(srt_score_left) & !is.na(srt_score_right))
dim(left_ear_na)
#1554 who do not have srt score for right ear
right_ear_na <- srt_clean %>% 
  filter(is.na(srt_score_right) & !is.na(srt_score_left))
dim(right_ear_na)
#211763 have neither left ear nor right ear score
both_ear <- srt_clean %>% 
   filter(is.na(srt_score_left) & is.na(srt_score_right))
dim(both_ear)

In [None]:
#create a categorical variable to classify the score for left and right ear
#137821 individuals have either right or left score, or both (216526 individuals who have neither the right nor left score are deleted)
#134058 have either right or left score, or both  (deleting the outliers)
srt_cat <- srt_clean %>%
  filter(!is.na(srt_score_left) | !is.na(srt_score_right)) %>% 
  mutate(srt_left_cat = case_when(
        srt_score_left < -5.5   ~  "normal",
        -5.5 <= srt_score_left & srt_score_left <= -3.5   ~  "insufficient",
          srt_score_left > -3.5  ~ "poor",
         TRUE                    ~ ""
  )) %>% 
  mutate(srt_right_cat = case_when(
        srt_score_right < -5.5   ~  "normal",
        -5.5 <= srt_score_right & srt_score_right <= -3.5   ~  "insufficient",
          srt_score_right > -3.5  ~ "poor",
         TRUE                    ~ ""
  )) %>% 
  #remove the score 8 (outlier) for both ears
  filter(!(srt_score_left == 8) & !(srt_score_right == 8))

#to use the function max/min, replace all NA to 0 for numerical variables 
srt_fill_na <- srt_cat
srt_fill_na[is.na(clean <- srt_cat)] <- 0
 

# Create two new variables to show the best ear and the worst ear
srt_best_worst <-  srt_fill_na %>% 
  mutate(best = apply(select(., starts_with("f.200")), 1, function(x) min(x[x!=0])),
         worst = apply(select(., starts_with("f.200")), 1, function(x) max(x[x!=0]))
  ) %>% 
  #replace the Inf and -Inf values with 0
  mutate(best = if_else(best == Inf, 0, best),
         worst = if_else(worst == -Inf, 0, worst)
  )


#replace 0 with NA
library(dplyr) 
srt_best_worst <- na_if(srt_best_worst, 0)


#summary statistics 
summary(srt_best_worst$srt_score_left)

In [None]:
summary(srt_best_worst$srt_score_right)

In [None]:
summary_left <- srt_best_worst %>%
  filter(!is.na(srt_score_left)) %>% 
  summarise(std_left =sd(srt_score_left),
          mean_left = mean(srt_score_left),
          median_left = median(srt_score_left),
          n = n()
          ) 

summary_left 

In [None]:
summary_right <- srt_best_worst %>%
  filter(!is.na(srt_score_right)) %>% 
  summarise(std_right =sd(srt_score_right),
          mean_right = mean(srt_score_right),
          median_right = median(srt_score_right),
          n = n()
          ) 

summary_right

### Plots

In [None]:
#histogram for SRT Score for Left Ear
left_plot = srt_best_worst %>% 
  filter(!is.na(srt_score_left)) %>% 
  ggplot(aes(x = srt_score_left,color = srt_left_cat)) +
      geom_histogram(binwidth = 0.1) +
   labs(
    title = "SRT Score for Left Ear (by category)",
    x = "SRT Score for Left ",
    y = "Count"
    ) + 
   scale_x_continuous(
    breaks = c(-10, -5.5, -3.5, 0, 3.5, 5.5),
    labels = c("-10", "-5.5", "-3.5", "0", "3.5", "5.5")) +
  scale_color_discrete("SRT Score Category",labels=c("normal","insuficient","poor")) 


left_plot

In [None]:
#QQ plot for SRT Score for Left Ear
qq_left_total <- srt_best_worst %>% 
  filter(!is.na(srt_score_left)) %>% 
  ggplot(aes(sample = (srt_score_left - mean(srt_score_left))/sd(srt_score_left))) +
  stat_qq() +
  stat_qq_line() +
  labs(title="QQ plot for SRT Score (Left)",
       y = "SRT Score Quantile",
       x = "Normal Quantile") +
theme_classic()

qq_left_total

In [None]:
qq_left_by_cat <- srt_best_worst %>% 
  filter(!is.na(srt_score_left)) %>% 
  ggplot(aes(sample = (srt_score_left - mean(srt_score_left))/sd(srt_score_left), color = srt_left_cat)) +
  stat_qq() +
  stat_qq_line() +
  labs(title="QQ plot for SRT Score (Left)",
       y = "SRT Score Quantile",
       x = "Normal Quantile") +
theme_classic()

qq_left_by_cat

In [None]:
#histogram for SRT Score for Right Ear 
right_plot <- srt_best_worst %>% 
  filter(!is.na(srt_score_right)) %>% 
  ggplot(aes(x = srt_score_right, color=srt_right_cat) ) +
      geom_histogram(binwidth = 0.1) +
  labs(
    title = "SRT Score for Right Ear (by category)",
    x = "SRT Score for Right ",
    y = "Count"
    ) + 
   scale_x_continuous(
    breaks = c(-10, -5.5, -3.5, 0, 3.5, 5.5),
    labels = c("-10", "-5.5", "-3.5", "0", "3.5", "5.5")) +
  scale_y_continuous(
    breaks = c(2500, 5000, 7500, 10000, 12500, 15000),
    labels = c("2500", "5000", "7500", "10000", "12500", "15000")) +
  scale_color_discrete("SRT Score Category",labels=c("normal","insuficient","poor")) 
 
right_plot

In [None]:
#QQ plot for SRT Score for Right Ear
qq_right_total <- srt_best_worst %>% 
  filter(!is.na(srt_score_right)) %>% 
  ggplot(aes(sample = (srt_score_right - mean(srt_score_right))/sd(srt_score_right))) +
  stat_qq(distribution = stats::qnorm) +
  stat_qq_line() +
  labs(title="QQ plot for SRT Score (Right)",
       y = "SRT Score Quantile",
       x = "Normal Quantile") +
theme_classic()

qq_right_total

In [None]:
qq_right_by_cat <- srt_best_worst %>% 
  filter(!is.na(srt_score_right)) %>% 
  ggplot(aes(sample = (srt_score_right - mean(srt_score_right))/sd(srt_score_right), color = srt_right_cat)) +
  stat_qq(distribution = stats::qnorm) +
  stat_qq_line() +
  labs(title="QQ plot for SRT Score (Right)",
       y = "SRT Score Quantile",
       x = "Normal Quantile") +
theme_classic()

qq_right_by_cat

In [None]:
srt_best_worst_cat <- srt_best_worst %>%
  mutate(srt_best_cat = case_when(
        best < -5.5   ~  "normal",
        -5.5 <= best & best <= -3.5   ~  "insufficient",
          best > -3.5  ~ "poor",
         TRUE                    ~ ""
  )) %>% 
  mutate(srt_worst_cat = case_when(
        worst < -5.5   ~  "normal",
        -5.5 <= worst & worst <= -3.5   ~  "insufficient",
          worst > -3.5  ~ "poor",
         TRUE                    ~ ""
  )) %>% 
  filter(!is.na(best) | !is.na(worst))



best_plot <- srt_best_worst_cat %>% 
  ggplot(aes(x = best, color=srt_best_cat) ) +
      geom_histogram(binwidth = 0.1) +
  labs(
    title = "SRT Score for the Best Ear (by category)",
    x = "SRT Score for the Best ",
    y = "Count"
    ) + 
   scale_x_continuous(
    breaks = c(-10, -5.5, -3.5, 0, 3.5, 5.5),
    labels = c("-10", "-5.5", "-3.5", "0", "3.5", "5.5")) +
  scale_y_continuous(
    breaks = c(2500, 5000, 7500, 10000, 12500, 15000),
    labels = c("2500", "5000", "7500", "10000", "12500", "15000")) +
  scale_color_discrete("SRT Score Category",labels=c("normal","insuficient","poor")) 
 
best_plot

In [None]:
qq_best <- srt_best_worst_cat %>% 
  ggplot(aes(sample = (best - mean(best))/sd(best))) +
  stat_qq(distribution = stats::qnorm) +
  stat_qq_line(col= 'blue') +
  labs(title="QQ plot for the Best SRT Score",
       y = "SRT Score Quantiles",
       x = "normal Quantiles") + 
theme_classic()  

qq_best

In [None]:
worst_plot <- srt_best_worst_cat %>% 
  ggplot(aes(x = worst, color=srt_worst_cat)) +
      geom_histogram(binwidth = 0.1) +
  labs(
    title = "SRT Score for the Worst Ear (by category)",
    x = "SRT Score for the Best ",
    y = "Count"
    ) + 
   scale_x_continuous(
    breaks = c(-10, -5.5, -3.5, 0, 3.5, 5.5),
    labels = c("-10", "-5.5", "-3.5", "0", "3.5", "5.5")) +
  scale_y_continuous(
    breaks = c(2500, 5000, 7500, 10000, 12500, 15000),
    labels = c("2500", "5000", "7500", "10000", "12500", "15000")) +
  scale_color_discrete("SRT Score Category",labels=c("normal","insuficient","poor")) 
 

worst_plot

In [None]:
qq_worst <- srt_best_worst_cat %>% 
  ggplot(aes(sample = (worst - mean(worst))/sd(worst))) +
  stat_qq(distribution = stats::qnorm) +
  stat_qq_line(col= 'blue') +
  labs(title="QQ plot for the Worst SRT Score",
       y = "SRT Score Quantiles",
       x = "normal quantiles") +
theme_classic()  

qq_worst