# Tinnitus project

## Aim

Create a dataset of filtered individuals using the inclusion and exclusion criteria for tinnitus to perform association analyses. 

## Location of files

The original UKBB data are stored in the shared folder:
```
/SAY/dbgapstg/scratch/UKBiobank/phenotype_files/pleiotropy_R01/ukb42495_updatedJune2020
```

The filtered dataset is stored in my personal folder:

```
/home/dc2325/project/tinnitus
```

## Subset the data using variables of interest

Use the `ukbconv` utility with a list of pre-specified variables (passed with `-i`) to extract the subset in R format:

```
./ukbconv ukb42495.enc_ukb r -i/home/dc2325/project/tinnitus/selectvars_062520.txt -o/home/dc2325/project/tinnitus/ukb42495_subset062520
```

In [None]:
# Analysis of the tinnitus data
#
# Loads the phenotype subset into R, keeps only the individuals that are
# present in the QC'ed genotype .fam file, and writes the merged table to CSV.
getwd()
setwd("../TINNITUS_UKBB")
# Clean workspace (removes objects only; attached packages and options stay)
rm(list = ls())

# Step 1 run script to import data to R
# `Rscript <file>` is a shell command and is a syntax error inside an R
# session; source() evaluates the script here so that the data frame `bd`
# used below is created in this workspace.
source("ukb42495_subset062520.r")
stopifnot(exists("bd"), is.data.frame(bd))
nrow(bd)

# Step 2 import as data.frame individuals that have a genotype using the
# already qc'ed genotypic files
data.geno <- read.table(
  "UKB_Caucasians_phenotypeindepqc120319_updated020720removedwithdrawnindiv.fam",
  header = FALSE,
  stringsAsFactors = FALSE
)
names(data.geno) <- c("FID", "IID", "ignore1", "ignore2", "ignore3", "ignore4")
nrow(data.geno)

# Step 3 assign individual ID column
# NOTE(review): assumes the first column of `bd` holds the individual ID that
# matches the IID column of the .fam file -- confirm against the import script.
names(bd)[1] <- "IID"

# Step 4 Merge the two data frames
# all = FALSE keeps only individuals present in both tables (inner join)
data.geno.pheno <- merge(data.geno, bd, by = "IID", all = FALSE)
nrow(data.geno.pheno)

# Step 5 Save as csv file
write.csv(data.geno.pheno, "UKBB_071020_HI_genotypeqc.csv", row.names = FALSE)


Apply the exclusion criteria defined by the group to remove unwanted individuals. This takes into account ICD10 codes, ICD9 codes and f.20002 (self-report)

In [None]:
# Split the genotype-QC'ed table into excluded and retained individuals using
# the exclusion code list (one pattern per line).
# Make sure each line of the pattern list ends in $.
# NOTE(review): presumably this refers to `cat -A` showing `$` (Unix line
# endings, no `^M$`) -- confirm. A trailing carriage return in a pattern would
# stop it from matching.
#
# Both greps use exactly the same matching options (-w -f); the second one only
# adds -v. This makes the two outputs exact complements, so their row counts add
# up to the row count of the input file.

# To get a list of removed individuals
grep -w -f 200713_ICDcodes_exclusion.txt UKBB_071020_HI_genotypeqc.csv > 200713_UKBB_excluded_individuals.csv
wc -l < 200713_UKBB_excluded_individuals.csv
# To get the clean db with the included individuals
grep -w -v -f 200713_ICDcodes_exclusion.txt UKBB_071020_HI_genotypeqc.csv > 200713_UKBB_genotypeqc_clean.csv
wc -l < 200713_UKBB_genotypeqc_clean.csv
# The excluded list has no CSV header unless the header itself matches a
# pattern; build the version with the header that is compared below.
# NOTE(review): the "_header" file was referenced but never created in this
# notebook -- confirm this is the intended content.
{ head -n 1 UKBB_071020_HI_genotypeqc.csv; cat 200713_UKBB_excluded_individuals.csv; } > 200713_UKBB_excluded_individuals_header.csv
# Sanity check: lines present in both files. Only the header line is expected.
comm -12 <(sort 200713_UKBB_genotypeqc_clean.csv) <(sort 200713_UKBB_excluded_individuals_header.csv)


In [None]:
# Build the tinnitus phenotype (cases / controls) and the matching age from the
# exclusion-filtered dataset. Everything below works on `df_clean`, so the
# exclusion criteria are actually applied to the phenotype. Base R only: no
# plyr/dplyr needs to be attached.

# Import clean data
df_clean <- read.csv(file = "200713_UKBB_genotypeqc_clean.csv", stringsAsFactors = FALSE)

dim(df_clean)

# Column names for instances 0-3 of the tinnitus answer (f.4803) and of the
# age field (f.21003)
tinnitus_cols <- paste0("f.4803.", 0:3, ".0")
recode_cols <- paste0(tinnitus_cols, "_recode")
age_cols <- paste0("f.21003.", 0:3, ".0")

missing_cols <- setdiff(c("IID", tinnitus_cols, age_cols), names(df_clean))
if (length(missing_cols) > 0) {
  stop("Columns missing from the clean dataset: ",
       paste(missing_cols, collapse = ", "), call. = FALSE)
}

# Summarize the data on tinnitus phenotype for each of the instances
# (table() with useNA also counts missing answers, whatever the column type)
for (col in tinnitus_cols) {
  cat(col, "\n")
  print(table(df_clean[[col]], useNA = "ifany"))
}

# Recode f.4803 for every instance: Yes contains all four "Yes, ..." categories
# and No is "No, never". "Do not know" and "Prefer not to answer" are kept as
# they are, and missing values stay missing.
yes_answers <- c("Yes, now most or all of the time",
                 "Yes, now a lot of the time",
                 "Yes, now some of the time",
                 "Yes, but not now, but have in the past")

recode_tinnitus <- function(x) {
  # as.character() also handles factor columns and all-NA (logical) columns
  x <- as.character(x)
  x[x %in% yes_answers] <- "Yes"
  x[x %in% "No, never"] <- "No"
  x
}

df_clean[recode_cols] <- lapply(df_clean[tinnitus_cols], recode_tinnitus)

# Filtering based on multiple conditions individuals that become cases
r0 <- df_clean$f.4803.0.0_recode
r1 <- df_clean$f.4803.1.0_recode
r2 <- df_clean$f.4803.2.0_recode
r3 <- df_clean$f.4803.3.0_recode

no_dk <- c("No", "Do not know")
yes_na <- c("Yes", NA)
no_na <- c("No", NA)

all_missing <- is.na(r0) & is.na(r1) & is.na(r2) & is.na(r3)
any_yes <- r0 %in% "Yes" | r1 %in% "Yes" | r2 %in% "Yes" | r3 %in% "Yes"

# "No" at instance 0 followed by at least one "Yes", excluding the listed
# patterns where a later instance goes back to "No" / "Do not know".
# `==` is kept on purpose: a missing answer makes the comparison NA, and an NA
# condition is not counted as a case.
incident_case <- r0 == "No" & (r1 == "Yes" | r2 == "Yes" | r3 == "Yes") &
  !(r0 == "No" & r1 == "Yes" & r2 %in% no_dk & r3 %in% c(no_dk, NA)) &
  !(r0 == "No" & r1 %in% no_dk & r2 == "Yes" & r3 %in% no_dk) &
  !(r0 == "No" & r1 == "Yes" & r2 == "Yes" & r3 %in% no_dk)

# "Yes" or missing at instance 0, "Yes" or missing at some later instance, and
# no "No" / "Do not know" at any later instance.
# `any_yes` is an added guard: without it, an individual whose only non-missing
# answers are "Prefer not to answer" satisfied this rule and was labelled a case.
persistent_case <- r0 %in% yes_na &
  (r1 %in% yes_na | r2 %in% yes_na | r3 %in% yes_na) &
  !(r0 %in% yes_na & (r1 %in% no_dk | r2 %in% no_dk | r3 %in% no_dk)) &
  !all_missing &
  any_yes

is_case <- incident_case | persistent_case
# `%in% TRUE` maps an NA condition to "not a case"; NA_character_ keeps the
# column character even if no row qualifies
df_clean$cases <- ifelse(is_case %in% TRUE, "Yes", NA_character_)

# Controls: every answer is "No" or missing, with at least one answer present
is_control <- r0 %in% no_na & r1 %in% no_na & r2 %in% no_na & r3 %in% no_na &
  !all_missing
df_clean$controls <- ifelse(is_control, "No", NA_character_)

# Creates a column with the binary status for tinnitus of the individuals
# (cases first, then controls; NA when the individual is neither)
df_clean$tinnitus <- ifelse(is.na(df_clean$cases), df_clean$controls, df_clean$cases)

# Select the age
# For cases the first time they said yes
# For controls the last time they said no
answers <- as.matrix(df_clean[recode_cols])
ages <- as.matrix(df_clean[age_cols])
storage.mode(ages) <- "numeric"

is_yes <- !is.na(answers) & answers == "Yes"
is_no <- !is.na(answers) & answers == "No"

# Position (1-4, i.e. instance 0-3) of the first "Yes" / last "No" in each row;
# rows without such an answer get NA
first_yes <- max.col(is_yes * 1, ties.method = "first")
first_yes[rowSums(is_yes) == 0] <- NA_integer_
last_no <- max.col(is_no * 1, ties.method = "last")
last_no[rowSums(is_no) == 0] <- NA_integer_

df_clean$visit_idx <- ifelse(df_clean$tinnitus %in% "Yes", first_yes,
                             ifelse(df_clean$tinnitus %in% "No", last_no, NA_integer_))

# One age variable for cases and controls: the age column at the selected
# instance. An NA index row yields NA in the result.
df_clean$age_final <- ages[cbind(seq_len(nrow(ages)), df_clean$visit_idx)]

# Individuals with a defined tinnitus status and the columns used to derive
# their age
agecases <- df_clean[!is.na(df_clean$tinnitus),
                     c("IID", recode_cols, age_cols, "tinnitus", "visit_idx", "age_final")]
head(agecases)



In [None]:
# Get the number of the column: prints "<column number>:<field name>" for every
# header field of the tab-delimited file that contains WORD.
# -F matches WORD literally (the dots are not regex wildcards) and quoting
# "$WORD" prevents word splitting and globbing.

WORD="f.4803.0.0"; head -n1 ukb42495.tab | tr "\t" "\n" | grep -n -F -- "$WORD"
