# Tinnitus project

## Aim

Create a dataset of filtered individuals using the inclusion and exclusion criteria for tinnitus to perform association analyses. 

## Location of files

The original UKBB data are in the shared folder:
```
/SAY/dbgapstg/scratch/UKBiobank/phenotype_files/pleiotropy_R01/ukb42495_updatedJune2020
```

The filtered dataset is in my personal folder:

```
/home/dc2325/project/tinnitus
```

## Subset the data using variables of interest

Using the `ukbconv` utility and a list of pre-specified variables:

```
./ukbconv ukb42495.enc_ukb r -i/home/dc2325/project/tinnitus/selectvars_062520.txt -o/home/dc2325/project/tinnitus/ukb42495_subset062520
```

In [None]:
# Analysis of the tinnitus data

# Work from the project folder: every file name below is relative to it.
getwd()
setwd("../TINNITUS_UKBB")
# Clean workspace so objects left over from an earlier session cannot leak in
rm(list = ls())

# Step 1 run script to import data to R
# The script is source()d so that the objects it creates (expected: the `bd`
# data frame used below) land in this session. `Rscript <file>` is a shell
# command, not R code, and would run the script in a separate process.
source("ukb42495_subset062520.r")
nrow(bd)

# Step 2 import as data.frame individuals that have a genotype using the already qc'ed genotypic files
# The .fam file has no header row; only the first two of its six columns
# (family ID and individual ID) are given meaningful names here.
data.geno <- read.table(
  "UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.fam",
  header = FALSE,
  stringsAsFactors = FALSE
)
names(data.geno) <- c("FID", "IID", "ignore1", "ignore2", "ignore3", "ignore4")
nrow(data.geno)

# Step 3 assign individual ID column
# The first column of `bd` is renamed so it can be used as the merge key.
names(bd)[1] <- "IID"

# Step 4 Merge the two data frames
# all = FALSE keeps only individuals present in both tables (inner join).
data.geno.pheno <- merge(data.geno, bd, by = "IID", all = FALSE)
nrow(data.geno.pheno)

# Step 5 Save as csv file
write.csv(data.geno.pheno, "UKBB_071020_HI_genotypeqc.csv", row.names = FALSE)


Apply the exclusion criteria defined by the group to remove unwanted individuals. This takes into account ICD10 codes, ICD9 codes and f.20002 (self-report)

In [None]:
# To get a list of removed individuals: every row of the genotype-QC'ed table that
# matches at least one pattern of the exclusion list as a whole word.
# Make sure each line of the pattern list ends in $
grep -w -f 200713_ICDcodes_exclusion.txt UKBB_071020_HI_genotypeqc.csv > 200713_UKBB_excluded_individuals.csv
wc -l < 200713_UKBB_excluded_individuals.csv
# To get the clean db with the included individuals (-v keeps the rows that match no pattern)
grep -wv -f 200713_ICDcodes_exclusion.txt UKBB_071020_HI_genotypeqc.csv > 200713_UKBB_genotypeqc_tinnitus_excr.csv
# Count the rows of the file that was just written
wc -l < 200713_UKBB_genotypeqc_tinnitus_excr.csv
# To obtain the duplicate lines (if they exist): rows present in both files.
# The two greps are complementary, so this is expected to print nothing.
comm -12 <(sort 200713_UKBB_genotypeqc_tinnitus_excr.csv) <(sort 200713_UKBB_excluded_individuals.csv)


In [None]:
# Import clean data
# The table is the one written by the exclusion step (grep -wv ...) and is stored
# as `data_clean`, the name every later step of the analysis refers to.
# NOTE(review): confirm the file name if the exclusion output was renamed by hand.
data_clean <- read.csv(file = "200713_UKBB_genotypeqc_tinnitus_excr.csv", header = TRUE)

dim(data_clean)

# Summarize the data on tinnitus phenotype (f.4803) for each of the instances,
# after the exclusion criteria have been applied

summary(data_clean$f.4803.0.0)
summary(data_clean$f.4803.1.0)
summary(data_clean$f.4803.2.0)
summary(data_clean$f.4803.3.0)

# Recode f.4803 for every instance:
#   - the four "Yes, ..." answers are collapsed into "Yes"
#   - "No, never" becomes "No"
#   - "Prefer not to answer" becomes NA
#   - "Do not know" is kept as it is

library(plyr)

tinnitus_recode <- c(
  "Yes, now most or all of the time" = "Yes",
  "Yes, now a lot of the time" = "Yes",
  "Yes, now some of the time" = "Yes",
  "Yes, but not now, but have in the past" = "Yes",
  "No, never" = "No",
  "Prefer not to answer" = NA,
  "Do not know" = "Do not know"
)

# Same mapping for the four instances; the result goes to f.4803.<instance>.0_recode
for (instance in 0:3) {
  answer_col <- paste0("f.4803.", instance, ".0")
  data_clean[[paste0(answer_col, "_recode")]] <-
    revalue(data_clean[[answer_col]], tinnitus_recode)
}

summary(data_clean$f.4803.0.0_recode)
summary(data_clean$f.4803.1.0_recode)
summary(data_clean$f.4803.2.0_recode)
summary(data_clean$f.4803.3.0_recode)

# Filtering based on different instances for tinnitus phenotype
#
# Flag the cases from the four recoded f.4803 answers (instances 0 to 3).
# `cases` is "Yes" when the condition below is TRUE and NA otherwise (a FALSE
# condition and an NA condition both give NA).
# `&` binds tighter than `|`, so the condition reads (A) | (B):
#   (A) instance 0 is "No", at least one later instance is "Yes", and the
#       answers match none of these excluded patterns (instances 0, 1, 2, 3):
#         No, Yes,            No/Do not know, No/Do not know/NA
#         No, No/Do not know, Yes,            No/Do not know
#         No, Yes,            Yes,            No/Do not know
#   (B) instance 0 is "Yes" or NA, at least one later instance is "Yes" or NA,
#       no later instance is "No" or "Do not know", and the four answers are
#       not all NA.
# (A) compares with `==`, which returns NA for a missing answer; (B) uses
# `%in%`, which never returns NA.

data_clean$cases <- with(data_clean, ifelse(f.4803.0.0_recode == "No" & (f.4803.1.0_recode == "Yes" | f.4803.2.0_recode == "Yes" | f.4803.3.0_recode == "Yes")
                                                      & !(f.4803.0.0_recode == "No" & f.4803.1.0_recode == "Yes" & f.4803.2.0_recode  %in% c("No", "Do not know") & f.4803.3.0_recode %in% c("No", "Do not know",NA)) 
                                                      & !(f.4803.0.0_recode == "No" & f.4803.1.0_recode %in% c("No", "Do not know") & f.4803.2.0_recode == "Yes" & f.4803.3.0_recode %in% c("No", "Do not know"))
                                                      & !(f.4803.0.0_recode == "No" & f.4803.1.0_recode == "Yes" & f.4803.2.0_recode == "Yes" & f.4803.3.0_recode %in% c("No", "Do not know"))
                                                      | (f.4803.0.0_recode %in% c("Yes",NA) & (f.4803.1.0_recode %in% c("Yes",NA) | f.4803.2.0_recode %in% c("Yes",NA) | f.4803.3.0_recode %in% c("Yes",NA))
                                                         & !(f.4803.0.0_recode %in% c("Yes",NA) & (f.4803.1.0_recode %in% c("No", "Do not know") | f.4803.2.0_recode %in% c("No", "Do not know") | f.4803.3.0_recode %in% c("No", "Do not know")))
                                                         & !(f.4803.0.0_recode %in% c(NA) & f.4803.1.0_recode %in% c(NA) & f.4803.2.0_recode %in% c(NA) & f.4803.3.0_recode %in% c(NA))),
                                                      "Yes", NA))
# Number of cases (table() leaves the NA rows out of the count)
table(data_clean$cases)

# Flag the controls: every recoded f.4803 answer (instances 0 to 3) is either
# "No" or missing, and at least one of the four answers is present.
# `controls` is "No" for those individuals and NA for everybody else.
data_clean$controls <- local({
  answers <- data_clean[paste0("f.4803.", 0:3, ".0_recode")]
  # TRUE when all four answers are "No" or NA
  no_or_missing <- Reduce(`&`, lapply(answers, function(v) v %in% c("No", NA)))
  # TRUE when all four answers are NA
  all_missing <- Reduce(`&`, lapply(answers, function(v) v %in% c(NA)))
  ifelse(no_or_missing & !all_missing, "No", NA)
})

# Number of controls
table(data_clean$controls)

# Creates a column with the binary status for tinnitus of the individuals:
# the case label ("Yes") where there is one, otherwise the control label ("No"),
# otherwise NA.
# coalesce() is called through its namespace so this step does not depend on
# dplyr having been attached earlier in the session.

data_clean$tinnitus <- dplyr::coalesce(data_clean$cases, data_clean$controls)

table(data_clean$tinnitus)

# Get the number of NAs (individuals that are neither case nor control)
sum(is.na(data_clean$tinnitus))

                                 
# Get the "age at onset" using f.21003 Age when attended assessment centre for each of the instances
# For cases first time they replied yes to f.4803
# Get the subset of data to extract age

# dplyr provides %>%, filter() and select(); without it filter() would resolve
# to stats::filter(). It is attached here, after plyr (loaded earlier in this
# file), so the dplyr verbs are the ones in effect.
library(dplyr)

# Keep only individuals with a defined tinnitus status, plus the columns
# needed to look up the age: the recoded answers and the age at each instance
# (data field 21003: Age when attended assessment centre)
age_all <- data_clean %>%
  filter(!is.na(tinnitus)) %>%
  select(
    IID, tinnitus,
    f.4803.0.0_recode, f.4803.1.0_recode, f.4803.2.0_recode, f.4803.3.0_recode,
    f.21003.0.0, f.21003.1.0, f.21003.2.0, f.21003.3.0
  )
head(age_all)

library(pander)
res <- head(age_all)
pandoc.table(res)

# Get the subset data of cases
age_cases <- age_all %>%
  filter(tinnitus == "Yes") %>%
  select(
    IID,
    f.4803.0.0_recode, f.4803.1.0_recode, f.4803.2.0_recode, f.4803.3.0_recode,
    f.21003.0.0, f.21003.1.0, f.21003.2.0, f.21003.3.0
  )
res <- head(age_cases, 12)
pandoc.table(res)

# Age of each case at the first instance where they replied "Yes".
# The lookup works on whole columns instead of apply(age_cases, 1, ...):
# apply() converts the data frame to a matrix first, so the ages could come
# back as text, and the shape of the per-row result depended on how many "Yes"
# answers each row had.
recode_cols <- paste0("f.4803.", 0:3, ".0_recode")
age_cols <- paste0("f.21003.", 0:3, ".0")

# TRUE where the answer at that instance is "Yes"; a missing answer counts as FALSE
said_yes <- as.matrix(age_cases[recode_cols]) == "Yes"
said_yes[is.na(said_yes)] <- FALSE

# Instance (1 to 4) of the first "Yes": the instances are walked from last to
# first so the earliest one is written last. Rows without any "Yes" stay NA.
first_yes <- rep(NA_integer_, nrow(age_cases))
for (j in rev(seq_along(recode_cols))) {
  first_yes[said_yes[, j]] <- j
}

# Get the # of column where first replied Yes (position within age_cases)
age_cases$visit_idx <- match(recode_cols, colnames(age_cases))[first_yes]

# Get the final age: the age column of the same instance as the first "Yes"
# (NA when there is no "Yes" or the age at that instance is missing)
age_matrix <- as.matrix(age_cases[age_cols])
age_cases$age_final <- age_matrix[cbind(seq_len(nrow(age_cases)), first_yes)]

# Show first 6 rows
res <- head(age_cases)
pandoc.table(res)
summary(age_cases$age_final)

# Get the subset data of controls
age_control <- age_all %>%
  filter(tinnitus == "No") %>%
  select(
    IID,
    f.4803.0.0_recode, f.4803.1.0_recode, f.4803.2.0_recode, f.4803.3.0_recode,
    f.21003.0.0, f.21003.1.0, f.21003.2.0, f.21003.3.0
  )
res <- head(age_control, 12)
pandoc.table(res)

# Age of each control at the last instance where they replied "No".
# The lookup works on whole columns instead of apply(age_control, 1, ...):
# apply() converts the data frame to a matrix first, so the ages could come
# back as text, and the shape of the per-row result depended on how many "No"
# answers each row had.
recode_cols <- paste0("f.4803.", 0:3, ".0_recode")
age_cols <- paste0("f.21003.", 0:3, ".0")

# TRUE where the answer at that instance is "No"; a missing answer counts as FALSE
said_no <- as.matrix(age_control[recode_cols]) == "No"
said_no[is.na(said_no)] <- FALSE

# Instance (1 to 4) of the last "No": the instances are walked from first to
# last so the latest one is written last. Rows without any "No" stay NA.
last_no <- rep(NA_integer_, nrow(age_control))
for (j in seq_along(recode_cols)) {
  last_no[said_no[, j]] <- j
}

# Get the # of column where last replied No (position within age_control)
age_control$visit_idx <- match(recode_cols, colnames(age_control))[last_no]

# Get the final age: the age column of the same instance as the last "No"
# (NA when there is no "No" or the age at that instance is missing)
age_matrix <- as.matrix(age_control[age_cols])
age_control$age_final <- age_matrix[cbind(seq_len(nrow(age_control)), last_no)]

# Show first 6 rows
res <- head(age_control)
pandoc.table(res)
summary(age_control$age_final)

# Stack the cases on top of the controls into a single age table
age_tinnitus <- rbind(age_cases, age_control)
dim(age_tinnitus)

# Attach the age table to the complete database, matching individuals on IID.
# Only individuals present in both tables are kept. Columns that exist in both
# tables under the same name (other than IID) come out twice, with the .x / .y
# suffixes merge() adds by default.
data_clean_age <- merge(x = data_clean, y = age_tinnitus, by = "IID")


In [None]:
# Get the number of the column
# Splits the header row of ukb42495.tab into one field name per line and prints
# "<column number>:<field name>" for every field containing $WORD.
# -F matches the name literally (its dots are not regex wildcards) and the
# quotes keep the shell from word-splitting or globbing the value.

WORD="f.4803.0.0"; head -n1 ukb42495.tab | tr "\t" "\n" | grep -nF -- "$WORD"
