# Phenotypes:
1. Hearing aids (f.3393)
2. Hearing difficulty/problems (f.2247)
3. Hearing difficulty/background noise (f.2257)

## Aim

Create a dataset of filtered individuals, using the inclusion and exclusion criteria for several hearing-related phenotypes, to perform association analyses with the LMM.ipynb notebook.

## Location of files

The original UKBB data are in the shared folder:
```
/SAY/dbgapstg/scratch/UKBiobank/phenotype_files/pleiotropy_R01/ukb42495_updatedJune2020
```

The filtered dataset is in my personal folder:

```
/home/dc2325/project/HI_UKBB
```

## Important phenotypic files

1. `200804_UKBB_HI_genotypeqc.csv` File containing all individuals that passed QC and hearing impairment variables
2. `200804_UKBB_HI_genotypeqc_excr.csv` File with applied exclusion criteria as indicated [here](https://docs.google.com/document/d/1cpxTzElpsEkwmBDjnMBHg2wW7CL1AcG_b0_0wE_k5rQ/edit). **Note**: this file excludes individuals with otosclerosis, Meniere's and other diseases, if you need to filter those particular phenotypes use file 1 instead.
3. `200811_UKBB_Tinnitus_plan1_2_3_f4803` File with filtered phenotypes for tinnitus plan 1,2 and 3 and imputed noise variables
4. `200814_UKBB_HI_genotypeqc_excr_impvars` Database with qc'ed individuals, exclusion criteria, noise imputed vars and tinnitus phenotypes

## Analysis plan

The phenotypes to be analyzed are the following:

1. Hearing aid user (f.3393)
"Do you use a hearing aid most of the time?"

2. Hearing difficulty/problems (f.2247)
"Do you have any difficulty with your hearing?"

3. Hearing difficulty/background noise (f.2257)
"Do you find it difficult to follow a conversation if there is background noise (such as TV, radio, children playing)?"

**Sex corresponds to f.22001 (genetic sex):**

- Male = 0
- Female = 1

**Noisy workplace and loud music exposure frequency: same as for Tinnitus**
                
1. Remove inconsistent individuals 
    - said 1,2 or 3 and in following visits said 0
    - said a higher exposure (e.g 3) and then a lower one (e.g 1 or 2) in following visits
2. Retain consistent individuals and use highest reported exposure

**The needs to be inverse normalized**

**Covariates to be included in the analysis include:**

1. Age at time of test (calculated from f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0)
2. Sex f.22001
3. Volume left ear f.4270 and right ear f.4277 (the volume set by the participant for the measurement used in the analysis; in our case, the last time they took the test). For the analysis we use the average of the right and left ear, since there is overlap in the volume distribution
4. Noisy workplace f.4825
5. Loud music exposure f.4836


## Load libraries and set working dir

In [None]:
#Load libraries
library(plyr)
library(tidyverse)
library(pander)
library(ggpubr)
library(rapportools)
library(ggplot2)
#Get working directory
getwd()

In [2]:
# Set working directory: the relative file name passed to read.csv() further
# down is resolved against this folder
setwd('~/project/HI_UKBB')

In [3]:
# Clean workspace: drop every object currently in the global environment so the
# notebook starts from an empty set of variables (attached packages are not affected)
rm(list=ls())

In [4]:
# Load the QC'ed table with exclusion criteria, imputed noise variables and
# tinnitus phenotypes. The outputs recorded in this notebook show the answer
# columns as factors; R >= 4.0 reads strings as character by default, so the
# conversion is requested explicitly to keep results the same across R versions.
df.final.imp <- read.csv("200814_UKBB_HI_genotypeqc_excr_impvars.csv",
                         stringsAsFactors = TRUE)

## Data summary and recode

In [5]:
# Number of rows and columns of the table that was just loaded
dim(df.final.imp)

In [6]:
# Counts per genetic sex. The full column name is spelled out: `$f.22001` only
# worked through partial matching, which returns NULL as soon as more than one
# column starts with "f.22001".
table(df.final.imp$f.22001.0.0)


Female   Male 
192414 161933 

In [7]:
# Numeric sex coding used as covariate: Male -> "0", Female -> "1"
sex_levels <- df.final.imp[["f.22001.0.0"]]
df.final.imp[["sex"]] <- revalue(sex_levels, c("Male" = "0", "Female" = "1"))
rm(sex_levels)
table(df.final.imp[["sex"]])
dim(df.final.imp)


     1      0 
192414 161933 

# f.3393 Hearing aid: filter out inconsistent cases

In [110]:
# Recode the hearing-aid answers of the given columns.
#   df          : data.frame holding the answer columns
#   column_name : character vector with the names of the columns to recode
# For every column <name> a new column <name>_recode is added in which
# "No" becomes 0, "Yes" becomes 1 and "Prefer not to answer" becomes NA.
# Returns a list with the extended data.frame (`df`) and the names of the
# added columns (`new_column_names`).
# Note: this definition lives in the global environment and therefore takes
# precedence over dplyr::recode() for unqualified calls.
recode <- function(df, column_name) {
  answer_codes <- c("No" = 0, "Yes" = 1, "Prefer not to answer" = NA)
  new_names <- c()
  for (source_col in column_name) {
    target_col <- paste0(source_col, "_recode")
    df[, target_col] <- revalue(df[, source_col], answer_codes)
    new_names <- c(new_names, target_col)
  }
  list(df = df, new_column_names = new_names)
}

# columns that need to be recoded (hearing aid, one per visit):
column_name <- c("f.3393.0.0", "f.3393.1.0", "f.3393.2.0", "f.3393.3.0")

# Run the recoding once and keep both parts of the result; the previous version
# called recode() a second time over the whole table only to read the names.
recoded <- recode(df = df.final.imp, column_name)

# data.frame with the recoded columns added:
df.final.imp <- recoded$df

# names of the recoded columns:
new_column_names <- recoded$new_column_names

# show recode summary:
for (i in new_column_names) {
  cat(i, "summary:")
  print(table(df.final.imp[, i]))
  cat("\n")
}

f.3393.0.0_recode summary:
     0      1 
202311   9962 

f.3393.1.0_recode summary:
    0     1 
14472   878 

f.3393.2.0_recode summary:
    0     1 
32852  2853 

f.3393.3.0_recode summary:
   0    1 
2225  184 



In [111]:
# Dimensions after the *_recode columns were added to the table
dim(df.final.imp)

## f.3393 Get patterns for possible answers

In [112]:
# Keep only the participant ID and the recoded hearing-aid (f.3393) columns
data_sub <- select(df.final.imp, "IID", all_of(new_column_names))

# Collapse the answers of one participant into a single pattern string.
#   x : one row as a vector; element 1 is the IID and elements 2 to 5 are the
#       recoded answers of the four visits, in visit order.
# Missing answers are skipped, so the result looks like "0000", "111", "991",
# etc. A participant without any answer yields the empty string "" (never NA).
# The former `is.null(visit)` branch could not be reached and was removed.
f2 <- function(x) {
  answers <- x[2:5] # do not take the first column (i.e. IID)
  paste(answers[!is.na(answers)], collapse = "")
}

# Build the answer pattern of every participant (row-wise) and list the
# distinct patterns; participants without any answer get ""
data_sub[["visit"]] <- apply(data_sub, MARGIN = 1, FUN = f2)

names(table(data_sub[["visit"]]))

In [113]:
# Frequency of each answer pattern, then the set of distinct patterns
table(data_sub[["visit"]])
all_pattern <- unique(data_sub[["visit"]])


            0     00    000   0000   0001    001   0010   0011     01    010 
123790 191176  22628   3930    215     13    247      6     10   1830     24 
  0100    011   0110   0111      1     10    100    101     11    110   1100 
     1    192      1      4   9269     77     13     17    766      6      1 
   111   1111 
   125      6 

## f.3393 Get all the consistent cases

In [114]:
# Patterns made only of "No" answers (0) and only of "Yes" answers (1)
all_no <- all_pattern[grepl("0", all_pattern) & !grepl("1", all_pattern)]
all_yes <- all_pattern[grepl("1", all_pattern) & !grepl("0", all_pattern)]
table(data_sub[data_sub$visit %in% c(all_no, all_yes), "visit"])


     0     00    000   0000      1     11    111   1111 
191176  22628   3930    215   9269    766    125      6 

In [115]:
# Patterns mixing "No" (0) and "Yes" (1) across visits
yes_no <- all_pattern[grepl("1", all_pattern) & grepl("0", all_pattern)]
# Patterns with one single kind of answer
consistent_1 <- c(all_no, all_yes)
table(data_sub[data_sub$visit %in% yes_no, "visit"])


0001  001 0010 0011   01  010 0100  011 0110 0111   10  100  101  110 1100 
  13  247    6   10 1830   24    1  192    1    4   77   13   17    6    1 

In [116]:
# For f.3393 the only candidates for inconsistency are the mixed yes/no patterns
might_inconsistent <- yes_no

## Remove inconsistent cases

In [117]:
# Mixed patterns that are kept: one or more "No" answers followed only by
# "Yes" answers. Every other mixed pattern counts as inconsistent.
exceptions <- c("0001", "001", "0011", "01", "011", "0111")
inconsistent <- setdiff(x = might_inconsistent, y = exceptions)

cat("There are", length(which(is.empty(data_sub[["visit"]]))), "NAs, removed")

There are 123790 NAs, removed

In [118]:
# Number of participants whose answer pattern is flagged as inconsistent
cat("There are", sum(data_sub[["visit"]] %in% inconsistent), "inconsistent cases, removed")

There are 146 inconsistent cases, removed

In [119]:
# IDs of participants with at least one answer and a consistent pattern
IID_hearing_aid <- data_sub %>%
  filter(!is.empty(visit), !(visit %in% inconsistent)) %>%
  select(IID)

dim(IID_hearing_aid)

In [120]:
# Turn the one-column data.frame into a plain vector of IDs
IID_hearing_aid <- IID_hearing_aid[["IID"]]
cat("After removing all the NAs and inconsistent cases, there are", length(IID_hearing_aid), " individuals left")

After removing all the NAs and inconsistent cases, there are 230411  individuals left

In [121]:
# Full table restricted to the participants retained for f.3393
df.hearing.aid <- filter(df.final.imp, IID %in% IID_hearing_aid)

dim(df.hearing.aid)

## Step 1: classify cases and controls

In [122]:
# Working table for f.3393: IDs, genetic sex, age at each visit and the
# recoded hearing-aid answer at each visit
hearing_aid <- select(
  df.hearing.aid,
  IID, FID, f.22001.0.0,
  f.21003.0.0, f.21003.1.0, f.21003.2.0, f.21003.3.0,
  f.3393.0.0_recode, f.3393.1.0_recode, f.3393.2.0_recode, f.3393.3.0_recode
)
head(hearing_aid)

Unnamed: 0_level_0,IID,FID,f.22001.0.0,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0,f.3393.0.0_recode,f.3393.1.0_recode,f.3393.2.0_recode,f.3393.3.0_recode
Unnamed: 0_level_1,<int>,<int>,<fct>,<int>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<fct>
1,1000019,1000019,Female,47,,,,0.0,,,
2,1000022,1000022,Male,53,,,,0.0,,,
3,1000035,1000035,Male,63,,,,0.0,,,
4,1000046,1000046,Female,62,,73.0,,,,0.0,
5,1000054,1000054,Female,65,,,,0.0,,,
6,1000063,1000063,Male,43,,,,0.0,,,


## Hearing aid f.3393

In [123]:
# Only the recoded hearing-aid answers (columns whose name starts with "f.3393")
data_sub <- hearing_aid %>%
  select(starts_with("f.3393"))

# Decide whether one participant is a hearing-aid case.
#   x : the recoded answers of the (up to) four visits, in visit order
# Returns TRUE when any non-missing answer equals "1", otherwise FALSE.
f <- function(x) {
  answers <- x[1:4]
  "1" %in% answers[!is.na(answers)]
}

# Flag every participant as case (TRUE) or control (FALSE)
hearing_aid[["cases"]] <- apply(data_sub, MARGIN = 1, FUN = f)
head(hearing_aid, n = 10)

Unnamed: 0_level_0,IID,FID,f.22001.0.0,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0,f.3393.0.0_recode,f.3393.1.0_recode,f.3393.2.0_recode,f.3393.3.0_recode,cases
Unnamed: 0_level_1,<int>,<int>,<fct>,<int>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<lgl>
1,1000019,1000019,Female,47,,,,0.0,,,,False
2,1000022,1000022,Male,53,,,,0.0,,,,False
3,1000035,1000035,Male,63,,,,0.0,,,,False
4,1000046,1000046,Female,62,,73.0,,,,0.0,,False
5,1000054,1000054,Female,65,,,,0.0,,,,False
6,1000063,1000063,Male,43,,,,0.0,,,,False
7,1000078,1000078,Female,52,57.0,60.0,,,0.0,0.0,,False
8,1000105,1000105,Female,54,,,,0.0,,,,False
9,1000112,1000112,Male,58,,68.0,,,,1.0,,True
10,1000141,1000141,Female,49,,,,0.0,,,,False


In [124]:
# Participants classified as cases
hearing_aid_cases <- filter(hearing_aid, cases == TRUE)
head(hearing_aid_cases)
cat(nrow(hearing_aid_cases), "individuals are cases for f.3393")

Unnamed: 0_level_0,IID,FID,f.22001.0.0,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0,f.3393.0.0_recode,f.3393.1.0_recode,f.3393.2.0_recode,f.3393.3.0_recode,cases
Unnamed: 0_level_1,<int>,<int>,<fct>,<int>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<lgl>
1,1000112,1000112,Male,58,,68.0,,,,1.0,,True
2,1001067,1001067,Male,50,,,,1.0,,,,True
3,1001384,1001384,Female,61,,,,1.0,,,,True
4,1001459,1001459,Male,64,,,,1.0,,,,True
5,1002548,1002548,Male,62,,,,1.0,,,,True
6,1002888,1002888,Male,68,,,,1.0,,,,True


12462 individuals are cases for f.3393

In [125]:
# Participants classified as controls
hearing_aid_controls <- filter(hearing_aid, cases == FALSE)
head(hearing_aid_controls)
cat(nrow(hearing_aid_controls), "individuals are controls for f.3393")

Unnamed: 0_level_0,IID,FID,f.22001.0.0,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0,f.3393.0.0_recode,f.3393.1.0_recode,f.3393.2.0_recode,f.3393.3.0_recode,cases
Unnamed: 0_level_1,<int>,<int>,<fct>,<int>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<lgl>
1,1000019,1000019,Female,47,,,,0.0,,,,False
2,1000022,1000022,Male,53,,,,0.0,,,,False
3,1000035,1000035,Male,63,,,,0.0,,,,False
4,1000046,1000046,Female,62,,73.0,,,,0.0,,False
5,1000054,1000054,Female,65,,,,0.0,,,,False
6,1000063,1000063,Male,43,,,,0.0,,,,False


217949 individuals are controls for f.3393

In [126]:
# Sanity check: cases plus controls must account for every participant, so the
# result should be 0. The counts are taken from the objects themselves instead
# of being typed in by hand, so the check stays valid when the data change.
nrow(hearing_aid) - nrow(hearing_aid_controls) - nrow(hearing_aid_cases)

## Recode cases=1 and controls=0

In [127]:
# Integer phenotype code derived from the logical flag: TRUE -> 1, FALSE -> 0
hearing_aid[["hearing_aid_cat"]] <- as.integer(hearing_aid[["cases"]])
head(hearing_aid, n = 10)
# Get the number of NAs
#length(which(is.na(hearing_all$hearing_aid_cat)))

Unnamed: 0_level_0,IID,FID,f.22001.0.0,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0,f.3393.0.0_recode,f.3393.1.0_recode,f.3393.2.0_recode,f.3393.3.0_recode,cases,hearing_aid_cat
Unnamed: 0_level_1,<int>,<int>,<fct>,<int>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<lgl>,<int>
1,1000019,1000019,Female,47,,,,0.0,,,,False,0
2,1000022,1000022,Male,53,,,,0.0,,,,False,0
3,1000035,1000035,Male,63,,,,0.0,,,,False,0
4,1000046,1000046,Female,62,,73.0,,,,0.0,,False,0
5,1000054,1000054,Female,65,,,,0.0,,,,False,0
6,1000063,1000063,Male,43,,,,0.0,,,,False,0
7,1000078,1000078,Female,52,57.0,60.0,,,0.0,0.0,,False,0
8,1000105,1000105,Female,54,,,,0.0,,,,False,0
9,1000112,1000112,Male,58,,68.0,,,,1.0,,True,1
10,1000141,1000141,Female,49,,,,0.0,,,,False,0


## Step 2: get the ages for hearing aids (f.3393)

In [128]:
# Cases only: ID, the four recoded answers, then the age at each of the four visits
aid_age_cases <- select(
  filter(hearing_aid, hearing_aid_cat == "1"),
  IID,
  f.3393.0.0_recode, f.3393.1.0_recode, f.3393.2.0_recode, f.3393.3.0_recode,
  f.21003.0.0, f.21003.1.0, f.21003.2.0, f.21003.3.0
)
head(aid_age_cases, n = 12)

Unnamed: 0_level_0,IID,f.3393.0.0_recode,f.3393.1.0_recode,f.3393.2.0_recode,f.3393.3.0_recode,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0
Unnamed: 0_level_1,<int>,<fct>,<fct>,<fct>,<fct>,<int>,<int>,<int>,<int>
1,1000112,,,1.0,,58,,68.0,
2,1001067,1.0,,,,50,,,
3,1001384,1.0,,,,61,,,
4,1001459,1.0,,,,64,,,
5,1002548,1.0,,,,62,,,
6,1002888,1.0,,,,68,,,
7,1002944,1.0,,,,65,,,
8,1003258,0.0,1.0,,,69,74.0,,
9,1004012,1.0,,,,57,,,
10,1004218,1.0,,,,66,,,


In [129]:
# For every row, store the position(s) of the cells equal to '1' ("Yes").
# The positions are column numbers within aid_age_cases (IID is column 1), and
# a row can hold several of them; the earliest one is selected later with min().
# NOTE(review): the comparison runs over every column of the row, not only the
# f.3393 ones, so it assumes no IID or age value is ever exactly '1' - confirm.
aid_age_cases$visit_idx = apply(aid_age_cases, 1, function(x) unlist(which(x == '1')))
head(aid_age_cases)

Unnamed: 0_level_0,IID,f.3393.0.0_recode,f.3393.1.0_recode,f.3393.2.0_recode,f.3393.3.0_recode,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0,visit_idx
Unnamed: 0_level_1,<int>,<fct>,<fct>,<fct>,<fct>,<int>,<int>,<int>,<int>,<list>
1,1000112,,,1.0,,58,,68.0,,4
2,1001067,1.0,,,,50,,,,2
3,1001384,1.0,,,,61,,,,2
4,1001459,1.0,,,,64,,,,2
5,1002548,1.0,,,,62,,,,2
6,1002888,1.0,,,,68,,,,2


In [130]:
# Define offset:
# offset: number of columns between the first recode column (f.3393.0.0_recode)
# and the first age column (f.21003.0.0). Adding it to the position of an answer
# gives the position of the age recorded at the same visit; this only holds
# while the recode and the age columns are in the same visit order.
offset = which(colnames(aid_age_cases) == 'f.21003.0.0') - which(colnames(aid_age_cases) == 'f.3393.0.0_recode')

# Age at the first visit where a case answered "Yes".
#   x : one row of aid_age_cases as handed over by apply()
# Reads the global `offset` and the column names of `aid_age_cases`.
# Returns the age found at that visit, or NA when nothing could be extracted.
# Note: the assignment below binds the function to both `get_age_func` and `f`,
# replacing any earlier function named `f`.
f=get_age_func <- function(x) {
  # positions of the "Yes" answers stored in visit_idx for this row
  visit_index=x[which(colnames(aid_age_cases)=="visit_idx")]
  # earliest "Yes" position, shifted to the matching age column
  index=min(unlist(visit_index))+offset
  age=x[index]
  final_age=unlist(age)
  if(is.null(final_age))
  {final_age<-NA}
  return(final_age)
}

# Get the final age for cases
aid_age_cases$age_final = apply(aid_age_cases, 1, f)

# Show first 6 rows
head(aid_age_cases)
summary(aid_age_cases$age_final)

Unnamed: 0_level_0,IID,f.3393.0.0_recode,f.3393.1.0_recode,f.3393.2.0_recode,f.3393.3.0_recode,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0,visit_idx,age_final
Unnamed: 0_level_1,<int>,<fct>,<fct>,<fct>,<fct>,<int>,<int>,<int>,<int>,<list>,<int>
1,1000112,,,1.0,,58,,68.0,,4,68
2,1001067,1.0,,,,50,,,,2,50
3,1001384,1.0,,,,61,,,,2,61
4,1001459,1.0,,,,64,,,,2,64
5,1002548,1.0,,,,62,,,,2,62
6,1002888,1.0,,,,68,,,,2,68


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  40.00   61.00   65.00   63.83   68.00   81.00 

In [131]:
### Extract age for Control (3393)
# Controls only: ID, the four recoded answers, then the age at each of the four visits
aid_age_control <- select(
  filter(hearing_aid, hearing_aid_cat == "0"),
  IID,
  f.3393.0.0_recode, f.3393.1.0_recode, f.3393.2.0_recode, f.3393.3.0_recode,
  f.21003.0.0, f.21003.1.0, f.21003.2.0, f.21003.3.0
)
head(aid_age_control)

Unnamed: 0_level_0,IID,f.3393.0.0_recode,f.3393.1.0_recode,f.3393.2.0_recode,f.3393.3.0_recode,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0
Unnamed: 0_level_1,<int>,<fct>,<fct>,<fct>,<fct>,<int>,<int>,<int>,<int>
1,1000019,0.0,,,,47,,,
2,1000022,0.0,,,,53,,,
3,1000035,0.0,,,,63,,,
4,1000046,,,0.0,,62,,73.0,
5,1000054,0.0,,,,65,,,
6,1000063,0.0,,,,43,,,


In [132]:
# For every row, store the position(s) of the cells equal to '0' ("No").
# The positions are column numbers within aid_age_control; the latest one is
# selected below with max().
# NOTE(review): the comparison runs over every column of the row, so it assumes
# no IID or age value is ever exactly '0' - confirm.
aid_age_control$visit_idx = apply(aid_age_control, 1, function(x) unlist(which(x == '0')))

# Define offset:
# offset: number of columns between the first recode column (f.3393.0.0_recode)
# and the first age column (f.21003.0.0); added to an answer position it gives
# the position of the age recorded at the same visit
offset = which(colnames(aid_age_control) == 'f.21003.0.0') - which(colnames(aid_age_control) == 'f.3393.0.0_recode')

# Age at the last visit where a control answered "No".
#   x : one row of aid_age_control as handed over by apply()
# Reads the global `offset` and the column names of `aid_age_control`.
# Note: the assignment below rebinds both `get_age_func` and `f`, replacing the
# versions defined for the cases.

f=get_age_func <- function(x) {
  # positions of the "No" answers stored in visit_idx for this row
  visit_index=x[which(colnames(aid_age_control)=="visit_idx")]
  # latest "No" position, shifted to the matching age column
  index=max(unlist(visit_index))+offset
  age=x[index]
  age=unlist(age)
  return(age)
}

# Get the final age for controls
aid_age_control$age_final = apply(aid_age_control, 1, f)

# Show first 6 rows
head(aid_age_control)
summary(aid_age_control$age_final)

Unnamed: 0_level_0,IID,f.3393.0.0_recode,f.3393.1.0_recode,f.3393.2.0_recode,f.3393.3.0_recode,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0,visit_idx,age_final
Unnamed: 0_level_1,<int>,<fct>,<fct>,<fct>,<fct>,<int>,<int>,<int>,<int>,<list>,<int>
1,1000019,0.0,,,,47,,,,2,47
2,1000022,0.0,,,,53,,,,2,53
3,1000035,0.0,,,,63,,,,2,63
4,1000046,,,0.0,,62,,73.0,,4,73
5,1000054,0.0,,,,65,,,,2,65
6,1000063,0.0,,,,43,,,,2,43


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   39.0    53.0    60.0    58.6    65.0    82.0 

In [133]:
# Stack the case rows on top of the control rows (same columns in both tables)
hearing_aid_age <- rbind(
  aid_age_cases,
  aid_age_control
)
dim(hearing_aid_age)
head(hearing_aid_age)

Unnamed: 0_level_0,IID,f.3393.0.0_recode,f.3393.1.0_recode,f.3393.2.0_recode,f.3393.3.0_recode,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0,visit_idx,age_final
Unnamed: 0_level_1,<int>,<fct>,<fct>,<fct>,<fct>,<int>,<int>,<int>,<int>,<list>,<int>
1,1000112,,,1.0,,58,,68.0,,4,68
2,1001067,1.0,,,,50,,,,2,50
3,1001384,1.0,,,,61,,,,2,61
4,1001459,1.0,,,,64,,,,2,64
5,1002548,1.0,,,,62,,,,2,62
6,1002888,1.0,,,,68,,,,2,68


In [134]:
# Attach the selected age to the case/control table; all.x keeps every row of hearing_aid
hearing_cat_age <- merge(
  x = hearing_aid,
  y = hearing_aid_age[, c("IID", "age_final")],
  by = "IID",
  all.x = TRUE
)
head(hearing_cat_age)

Unnamed: 0_level_0,IID,FID,f.22001.0.0,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0,f.3393.0.0_recode,f.3393.1.0_recode,f.3393.2.0_recode,f.3393.3.0_recode,cases,hearing_aid_cat,age_final
Unnamed: 0_level_1,<int>,<int>,<fct>,<int>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<lgl>,<int>,<int>
1,1000019,1000019,Female,47,,,,0.0,,,,False,0,47
2,1000022,1000022,Male,53,,,,0.0,,,,False,0,53
3,1000035,1000035,Male,63,,,,0.0,,,,False,0,63
4,1000046,1000046,Female,62,,73.0,,,,0.0,,False,0,73
5,1000054,1000054,Female,65,,,,0.0,,,,False,0,65
6,1000063,1000063,Male,43,,,,0.0,,,,False,0,43


In [135]:
# Add phenotype code and age to the complete table of retained participants,
# keeping every row of that table
df_3393 <- merge(
  x = df.hearing.aid,
  y = hearing_cat_age[, c("IID", "hearing_aid_cat", "age_final")],
  by = "IID",
  all.x = TRUE
)
dim(df_3393)
head(df_3393)

Unnamed: 0_level_0,IID,FID,ignore1,ignore2,ignore3,ignore4,f.31.0.0,f.34.0.0,f.53.0.0,f.53.1.0,⋯,loud_music,noise_imp,music_imp,sex,f.3393.0.0_recode,f.3393.1.0_recode,f.3393.2.0_recode,f.3393.3.0_recode,hearing_aid_cat,age_final
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<fct>,<int>,<fct>,<fct>,⋯,<int>,<int>,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<int>,<int>
1,1000019,1000019,0,0,2,-9,Female,1960,2008-01-24,,⋯,,,,1,0.0,,,,0,47
2,1000022,1000022,0,0,1,-9,Male,1954,2008-01-22,,⋯,,,,0,0.0,,,,0,53
3,1000035,1000035,0,0,1,-9,Male,1944,2007-11-08,,⋯,,,,0,0.0,,,,0,63
4,1000046,1000046,0,0,2,-9,Female,1946,2008-12-01,,⋯,0.0,0.0,0.0,1,,,0.0,,0,73
5,1000054,1000054,0,0,2,-9,Female,1942,2007-11-23,,⋯,,,,1,0.0,,,,0,65
6,1000063,1000063,0,0,1,-9,Male,1967,2010-06-26,,⋯,0.0,0.0,0.0,0,0.0,,,,0,43


## Hearing difficulty/problems (2247)
### step1: classify cases and controls

In [136]:
# Answer counts for hearing difficulty (f.2247) at the first visit
table(df.final.imp[["f.2247.0.0"]])


         Do not know I am completely deaf                   No 
               13469                   56               253610 
Prefer not to answer                  Yes 
                 170                86713 

In [137]:
# Recode the hearing-difficulty answers of the given columns (replaces the
# earlier recode() definition).
#   df          : data.frame holding the answer columns
#   column_name : character vector with the names of the columns to recode
# For every column <name> a new column <name>_recode is added in which
# "No" becomes 0, "Yes" becomes 1, "Do not know" becomes 9, and both
# "Prefer not to answer" and "I am completely deaf" become NA.
# Returns a list with the extended data.frame (`df`) and the names of the
# added columns (`new_column_names`).
# Note: this definition lives in the global environment and therefore takes
# precedence over dplyr::recode() for unqualified calls.
recode <- function(df, column_name) {
  answer_codes <- c("No" = 0,
                    "Yes" = 1,
                    "Prefer not to answer" = NA,
                    "Do not know" = 9,
                    "I am completely deaf" = NA)
  new_names <- c()
  for (source_col in column_name) {
    target_col <- paste0(source_col, "_recode")
    df[, target_col] <- revalue(df[, source_col], answer_codes)
    new_names <- c(new_names, target_col)
  }
  list(df = df, new_column_names = new_names)
}

# columns that need to be recoded (hearing difficulty, one per visit):
column_name <- c("f.2247.0.0", "f.2247.1.0", "f.2247.2.0", "f.2247.3.0")

# Run the recoding once and keep both parts of the result; the previous version
# called recode() a second time over the whole table only to read the names
# (which also repeated the "values were not present" messages).
recoded <- recode(df = df.final.imp, column_name)

# data.frame with the recoded columns added:
df.final.imp <- recoded$df

# names of the recoded columns:
new_column_names <- recoded$new_column_names

# show recode summary:
for (i in new_column_names) {
  cat(i, "summary:")
  print(table(df.final.imp[, i]))
  cat("\n")
}

The following `from` values were not present in `x`: I am completely deaf

The following `from` values were not present in `x`: I am completely deaf



f.2247.0.0_recode summary:
     9      0      1 
 13469 253610  86713 

f.2247.1.0_recode summary:
   9    0    1 
 785 9709 4861 

f.2247.2.0_recode summary:
    9     0     1 
 1921 21757 12038 

f.2247.3.0_recode summary:
   9    0    1 
 128 1474  808 



## Get possible answers f.2247

In [None]:
# Keep only the participant ID and the recoded hearing-difficulty (f.2247) columns
data_sub <- select(df.final.imp, "IID", all_of(new_column_names))

# Collapse the answers of one participant into a single pattern string.
#   x : one row as a vector; element 1 is the IID and elements 2 to 5 are the
#       recoded answers of the four visits, in visit order.
# Missing answers are skipped, so the result looks like "0000", "111", "991",
# etc. A participant without any answer yields the empty string "" (never NA).
# The former `is.null(visit)` branch could not be reached and was removed.
f2 <- function(x) {
  answers <- x[2:5] # do not take the first column (i.e. IID)
  paste(answers[!is.na(answers)], collapse = "")
}

# Build the answer pattern of every participant (row-wise) and list the
# distinct patterns; participants without any answer get ""
data_sub[["visit"]] <- apply(data_sub, MARGIN = 1, FUN = f2)

names(table(data_sub[["visit"]]))

In [None]:
# Frequency of each answer pattern, then the set of distinct patterns
table(data_sub[["visit"]])
all_pattern <- unique(data_sub[["visit"]])

## f.2247 Get all the consistent cases

In [None]:
# Patterns containing one single kind of answer: only "No" (0), only "Yes" (1)
# or only "Do not know" (9)
all_no <- all_pattern[grepl("0", all_pattern) & !(grepl("9", all_pattern) | grepl("1", all_pattern))]
all_yes <- all_pattern[grepl("1", all_pattern) & !(grepl("0", all_pattern) | grepl("9", all_pattern))]
all_do_not_know <- all_pattern[grepl("9", all_pattern) & !(grepl("0", all_pattern) | grepl("1", all_pattern))]
table(data_sub[data_sub$visit %in% c(all_no, all_yes, all_do_not_know), "visit"])

In [None]:
# Patterns with one single kind of answer
consistent_1 <- c(all_no, all_yes, all_do_not_know)

# might be inconsistent cases: "Do not know" (9) mixed with "No" (0), no "Yes"
do_not_know_no <- all_pattern[grepl("9", all_pattern) & grepl("0", all_pattern) & !grepl("1", all_pattern)]
table(data_sub[data_sub$visit %in% do_not_know_no, "visit"])

In [None]:
# "Do not know" (9) mixed with "Yes" (1), no "No"
do_not_know_yes <- all_pattern[grepl("9", all_pattern) & grepl("1", all_pattern) & !grepl("0", all_pattern)]
table(data_sub[data_sub$visit %in% do_not_know_yes, "visit"])

In [None]:
# "No" (0) mixed with "Yes" (1), no "Do not know"
yes_no <- all_pattern[grepl("1", all_pattern) & grepl("0", all_pattern) & !grepl("9", all_pattern)]
table(data_sub[data_sub$visit %in% yes_no, "visit"])

In [None]:
# Patterns containing all three kinds of answer (0, 1 and 9)
with_all_three <- all_pattern[grepl("1", all_pattern) & grepl("9", all_pattern) & grepl("0", all_pattern)]
table(data_sub[data_sub$visit %in% with_all_three, "visit"])

In [None]:
# Every pattern that mixes different kinds of answers is a candidate for removal
might_inconsistent <- c(
  do_not_know_no,
  do_not_know_yes,
  yes_no,
  with_all_three
)

## f.2247 Remove inconsistent cases

In [None]:
# Mixed patterns that are kept; every other mixed pattern counts as inconsistent.
# NOTE(review): the list is enumerated by hand. Patterns of the same shape such
# as "0911" or "9111" are not in it and would be removed - confirm this is intended.
exceptions <- c(
  "91", "911", "991",
  "0001", "001", "0011", "01", "011", "0111",
  "091", "901", "9011"
)
inconsistent <- setdiff(x = might_inconsistent, y = exceptions)

cat("There are", length(which(is.empty(data_sub[["visit"]]))), "NAs, removed")

In [None]:
# Number of participants whose answer pattern is flagged as inconsistent
cat("There are", sum(data_sub[["visit"]] %in% inconsistent), "inconsistent cases, removed")

In [None]:
# IDs of participants with at least one answer and a consistent pattern
IID_cleaned <- data_sub %>%
  filter(!is.empty(visit), !(visit %in% inconsistent)) %>%
  select(IID)

dim(IID_cleaned)

In [None]:
# Turn the one-column data.frame into a plain vector of IDs
IID_cleaned <- IID_cleaned[["IID"]]
cat("After removing all the NAs and inconsistent cases, there are", length(IID_cleaned), "cases left")

In [None]:
# get the dataset after cleansing:
# The retained IDs were derived from df.final.imp, which also carries the
# f.2247 *_recode columns, so that table is the one to filter. The previous
# code filtered `df_clean`, an object that is not created anywhere in the
# visible part of this notebook.
data_cleaned <- df.final.imp %>%
  filter(IID %in% IID_cleaned)

dim(data_cleaned)

In [None]:
# Flag cases for hearing difficulty (f.2247): `cases` is TRUE when the row has
# at least one "Yes" and every answer different from "Yes" sits before the
# first "Yes"; missing answers are ignored by which(). For rows without any
# "Yes", or with only "Yes", max()/min() run on an empty set and emit warnings.
# NOTE(review): `hearing_clean` is not defined in the visible part of this
# notebook - confirm where it comes from.
# NOTE(review): starts_with("f.2247") also selects any f.2247.*_recode columns
# present in the table - confirm only the raw answer columns are intended.
hearing_diff <- hearing_clean %>% 
  mutate(cases = apply(select(.,starts_with("f.2247")), 1, function(x) length(which(x == "Yes")) > 0 & max(which(x != "Yes")) < min(which(x == "Yes")))
  )


# Flag controls for f.2247: `control` is the string "FALSE" when every visit is
# either "No" or missing and at least one visit is not missing; otherwise NA.
hearing_diff$control = with(hearing_diff, ifelse(f.2247.0.0 %in% c("No",NA) & f.2247.1.0 %in% c("No", NA) & f.2247.2.0 %in% c("No",NA) & f.2247.3.0 %in% c("No",NA) 
                                                 & !(f.2247.0.0 %in% c(NA) & f.2247.1.0 %in% c(NA) & f.2247.2.0 %in% c(NA) & f.2247.3.0 %in% c(NA)),"FALSE", NA)) 



# Cases for 2247 (90761 individuals according to the original note; also have hearing aid data)
hearing_diff_cases <- filter(hearing_diff, cases == "TRUE")


# Controls for 2247 (125358 individuals according to the original note; also have hearing aid data)
hearing_diff_control <- select(filter(hearing_diff, control == "FALSE"), -cases)

# Everyone who is either case or control for 2247 (216119 individuals according to the original note)
hearing_diff <- filter(hearing_diff, cases == "TRUE" | control == "FALSE")


### merge cases and controls

# Take `cases` wherever it is not NA, otherwise the control flag converted to logical
hearing_diff$hearing_diff_cat <- coalesce(hearing_diff$cases, as.logical(hearing_diff$control))

# dplyr::recode() is called with its namespace on purpose: this notebook defines
# its own recode(df, column_name) in the global environment, which masks the
# dplyr function and does not accept these arguments.
hearing_diff <- hearing_diff %>%
  select(-cases, -control) %>%
  mutate(hearing_diff_cat = as.factor(hearing_diff_cat),
         hearing_diff_cat = dplyr::recode(hearing_diff_cat, "FALSE" = "control", "TRUE" = "case"))

# Participants who report no hearing difficulty but use a hearing aid
# (85 according to the original note)
# NOTE(review): the comparisons below expect hearing_aid_cat to hold the labels
# "case"/"control", whereas the f.3393 section of this notebook stores it as
# integer 0/1 - confirm which coding the input table carries.
check_inconsistence <- filter(
  hearing_diff,
  hearing_diff_cat == "control" & hearing_aid_cat == "case"
)

# Combined phenotype: "case" when either field says case, "control" only when
# both say control; this reclassifies the participants above as cases for 2247
hearing_diff <- mutate(
  hearing_diff,
  hearing_diff_cat_new = case_when(
    hearing_diff_cat == "control" & hearing_aid_cat == "control" ~ "control",
    hearing_diff_cat == "case" & hearing_aid_cat == "control" ~ "case",
    hearing_diff_cat == "case" & hearing_aid_cat == "case" ~ "case",
    hearing_diff_cat == "control" & hearing_aid_cat == "case" ~ "case"
  )
)

# All cases after reclassification (original note: 90761 + 85 = 90846)
check_merge <- filter(hearing_diff, hearing_diff_cat_new == "case")

### Step 2: get the ages for Hearing difficulty/problems (2247)
### Extract age for Control (2247)
```{r age for control,warning=FALSE, collapse=FALSE,echo=T}
# Controls for the combined hearing-difficulty phenotype
hearing_diff_age_control <- hearing_diff %>% 
  filter(hearing_diff_cat_new == "control") 

#get the age at last visit for control
# offset: number of columns between the first answer column (f.2247.0.0) and
# the first age column (f.21003.0.0)
offset = which(colnames(hearing_diff_age_control) == 'f.21003.0.0') - which(colnames(hearing_diff_age_control) == 'f.2247.0.0')

# Per row: take the last f.2247 column answered "No" and read the age stored
# `offset` columns further along.
# NOTE(review): grep("f.2247", ...) matches every column whose name contains
# that pattern, so the position arithmetic assumes the four answer columns come
# first among the matches and sit next to each other - confirm.
# NOTE(review): apply() hands each row over as a single vector, so the age is
# presumably returned as character rather than numeric - confirm.
hearing_diff_age_control$age_diff = apply(hearing_diff_age_control, 1, function(x) {
  hear_aid = which(x[grep("f.2247", names(x))] == "No")
  first_index_offset = grep("f.2247", names(x))[1] - 1
  unlist(x[max(hear_aid) + first_index_offset + offset])
})

# first rows, kept for inspection
res<-head(hearing_diff_age_control)

In [None]:
# For those who are cases for hearing difficulty (f.2247 itself): get the age
# at the first visit answered "Yes"
hearing_diff_age_case <- hearing_diff %>% 
  filter(hearing_diff_cat == "case") 

# offset: number of columns between the first answer column (f.2247.0.0) and
# the first age column (f.21003.0.0)
offset = which(colnames(hearing_diff_age_case) == 'f.21003.0.0') - which(colnames(hearing_diff_age_case) == 'f.2247.0.0')

# Per row: take the first f.2247 column answered "Yes" and read the age stored
# `offset` columns further along.
# NOTE(review): relies on the matched f.2247 columns starting with the four
# answer columns, next to each other - confirm.
hearing_diff_age_case$age_diff = apply(hearing_diff_age_case, 1, function(x) {
  hear_aid =  which(x[grep("f.2247", names(x))] == "Yes")
  first_index_offset = grep("f.2247", names(x))[1] - 1
  unlist(x[min(hear_aid) + first_index_offset + offset])
})

# first rows, kept for inspection
res<-head(hearing_diff_age_case)

# For those who are controls for hearing difficulty (2247) but cases for
# hearing aid (3393): get the age at the last visit answered "No" for 2247
# (85 individuals according to the original note)

hearing_diff_age_case_2 <- hearing_diff %>% 
 filter(hearing_diff_cat == "control" & hearing_aid_cat == "case")

# offset: number of columns between the first answer column (f.2247.0.0) and
# the first age column (f.21003.0.0)
offset = which(colnames(hearing_diff_age_case_2 ) == 'f.21003.0.0') - which(colnames(hearing_diff_age_case_2) == 'f.2247.0.0')

# Per row: take the last f.2247 column answered "No" and read the age stored
# `offset` columns further along.
# NOTE(review): relies on the matched f.2247 columns starting with the four
# answer columns, next to each other - confirm.
hearing_diff_age_case_2$age_diff = apply(hearing_diff_age_case_2, 1, function(x) {
  hear_aid =  which(x[grep("f.2247", names(x))] == "No")
  first_index_offset = grep("f.2247", names(x))[1] - 1
  unlist(x[max(hear_aid) + first_index_offset + offset])
    })


# Stack the three groups (cases, controls, reclassified cases) into one table
hearing_diff_clean <- rbind(
  hearing_diff_age_case,
  hearing_diff_age_control,
  hearing_diff_age_case_2
)
dim(hearing_diff_clean)

## Hearing difficulty/background noise (2257)
### step1: classify cases and controls

In [None]:
# Flag cases for hearing difficulty with background noise (f.2257): `cases` is
# TRUE when the row has at least one "Yes" and every answer different from
# "Yes" sits before the first "Yes"; missing answers are ignored by which().
# For rows without any "Yes", or with only "Yes", max()/min() run on an empty
# set and emit warnings.
hearing_noise <- hearing_diff_clean %>% 
  mutate(cases = apply(select(.,starts_with("f.2257")), 1, function(x) length(which(x == "Yes")) > 0 & max(which(x != "Yes")) < min(which(x == "Yes")))
  )

# Flag controls for f.2257: `control` is the string "FALSE" when every visit is
# either "No" or missing and at least one visit is not missing; otherwise NA.
hearing_noise$control = with(hearing_noise, ifelse(f.2257.0.0 %in% c("No",NA) & f.2257.1.0 %in% c("No", NA) & f.2257.2.0 %in% c("No",NA) & f.2257.3.0 %in% c("No",NA) 
                                                 & !(f.2257.0.0 %in% c(NA) & f.2257.1.0 %in% c(NA) & f.2257.2.0 %in% c(NA) & f.2257.3.0 %in% c(NA)),"FALSE", NA)) 

# Cases for 2257 (123870 individuals according to the original note; also have hearing aid data + hearing difficulty)
hearing_noise_cases <- filter(hearing_noise, cases == "TRUE")


# Controls for 2257 (86775 individuals according to the original note; also have hearing aid data + hearing difficulty)
hearing_noise_control <- select(filter(hearing_noise, control == "FALSE"), -cases)

# Everyone who is either case or control for 2257 (210645 individuals according to the original note)
hearing_noise <- filter(hearing_noise, cases == "TRUE" | control == "FALSE")
```


### merge cases and controls (2257)
```{r merge cases and controls,warning=FALSE, collapse=FALSE,echo=T}
# merge the cases and controls for hearing noise (2257):
# take `cases` wherever it is not NA, otherwise the control flag converted to logical
hearing_noise$hearing_noise_cat <- coalesce(hearing_noise$cases, as.logical(hearing_noise$control))

# dplyr::recode() is called with its namespace on purpose: this notebook defines
# its own recode(df, column_name) in the global environment, which masks the
# dplyr function and does not accept these arguments.
hearing_noise <- hearing_noise %>%
  select(-cases, -control) %>%
  mutate(hearing_noise_cat = as.factor(hearing_noise_cat),
         hearing_noise_cat = dplyr::recode(hearing_noise_cat, "FALSE" = "control", "TRUE" = "case"))


# Participants whose 2257 status conflicts with the combined 2247 status
# (60827 according to the original note)
check_noise_inconsistence <- filter(hearing_noise, hearing_noise_cat != hearing_diff_cat_new)

# Keep only participants with the same status in both fields; individuals who
# are cases for one field and controls for the other are removed from the
# analysis (original note: 210645 - 60827 = 149818 left)
hearing_noise <- filter(hearing_noise, hearing_noise_cat == hearing_diff_cat_new)

```



### Step 2: get the ages for Hearing difficulty/background noise (2257) 
### Extract age for Control (2257)
```{r age for control,warning=FALSE, collapse=FALSE,echo=T}
# Controls for hearing difficulty with background noise (f.2257)
noise_age_control <- hearing_noise %>% 
  filter(hearing_noise_cat == "control") 

#get the age at last visit for controls
# offset: number of columns between the first answer column (f.2257.0.0) and
# the first age column (f.21003.0.0)
offset = which(colnames(noise_age_control) == 'f.21003.0.0') - which(colnames(noise_age_control) == 'f.2257.0.0')

# Per row: take the last f.2257 column answered "No" and read the age stored
# `offset` columns further along.
# NOTE(review): relies on the matched f.2257 columns starting with the four
# answer columns, next to each other - confirm.
# NOTE(review): apply() hands each row over as a single vector, so the age is
# presumably returned as character rather than numeric - confirm.
noise_age_control$age_noise = apply(noise_age_control, 1, function(x) {
  hear_aid = which(x[grep("f.2257", names(x))] == "No")
  first_index_offset = grep("f.2257", names(x))[1] - 1
  unlist(x[max(hear_aid) + first_index_offset + offset])
})

# first rows, kept for inspection
res<-head(noise_age_control)

```


### Extract age for Cases (2257)
```{r age for cases,warning=FALSE, collapse=FALSE,echo=T}
# Cases for hearing difficulty with background noise (f.2257)
noise_age_case <- hearing_noise %>% 
  filter(hearing_noise_cat == "case") 


#get the age at first visit for cases
# offset: number of columns between the first answer column (f.2257.0.0) and
# the first age column (f.21003.0.0)
offset = which(colnames(noise_age_case) == 'f.21003.0.0') - which(colnames(noise_age_case) == 'f.2257.0.0')

# Per row: take the first f.2257 column answered "Yes" and read the age stored
# `offset` columns further along.
# NOTE(review): relies on the matched f.2257 columns starting with the four
# answer columns, next to each other - confirm.
noise_age_case$age_noise = apply(noise_age_case, 1, function(x) {
  hear_aid = which(x[grep("f.2257", names(x))] == "Yes")
  first_index_offset = grep("f.2257", names(x))[1] - 1
  unlist(x[min(hear_aid) + first_index_offset + offset])
})

# first rows, kept for inspection
res<-head(noise_age_case)

# Stack cases and controls for 2257 into one table
hearing_noise_clean <- rbind(noise_age_case, noise_age_control)

# age_diff and age_noise were extracted with apply() over tables that also hold
# non-numeric columns, so they can arrive as (possibly space-padded) character
# strings. They are converted to numbers before being compared or minimised;
# as.character() first keeps the conversion correct for numeric input too.

# Participants whose age differs between 2247 and 2257
# (3252 according to the original note)
age_inconsistence <- hearing_noise_clean %>%
  filter(as.numeric(as.character(age_diff)) != as.numeric(as.character(age_noise))) %>%
  select(IID, FID, f.31.0.0,
         f.21003.0.0, f.21003.1.0, f.21003.2.0, f.21003.3.0,
         f.2247.0.0, f.2247.1.0, f.2247.2.0, f.2247.3.0,
         f.2257.0.0, f.2257.1.0, f.2257.2.0, f.2257.3.0,
         age_diff, age_noise, hearing_noise_cat)


# get the minimum age for 2247 and 2257 (as a number)
hearing_noise_clean <- transform(
  hearing_noise_clean,
  age_diff_noise = pmin(as.numeric(as.character(age_noise)),
                        as.numeric(as.character(age_diff)))
)

# drop unnecessary variables
# NOTE(review): `age_aid` is not created in the visible part of this notebook
# (the f.3393 section names its age column `age_final`) - confirm the column
# exists in the input, otherwise select() stops with an error.
hearing_noise_clean <- hearing_noise_clean %>%
  select(-hearing_diff_cat, -age_diff, -age_noise, -age_aid)