
# Define Tinnitus

In this notebook, we generate a phenotype definition for tinnitus (Data-Field 4803 -tinnitus) in the UKB based on the subset of white European subjects used in the analysis of age related hearing loss. The goal is to use the imputed and genotype data for the assocciation test. Age and sex as a covariate and two PCAs (will need to recalculate for the individuals used in the analysis). Please try to use the lowest allele frequency possible in the analysis.

# Cases and controls are defined based on two scenarios

## Scenario 1: Use entire (cases and controls) sample used for the hearing loss analysis
Use the entire sample of controls and cases of white Europeans which we analyzed for last publication and from this group of individuals only select those individuals who answered No - never had tinnitus at all assessments.
## Scenario 2: use only controls used for the hearing loss analysis

### tinnitus questionnaire in UKB
ACE touchscreen question "Do you get or have you had noises (such as ringing or buzzing) in your head or in one or both ears that lasts for more than five minutes at a time?"

# Read the data in the database

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
with open("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/092821_UKBB_486416ind_call90.csv") as fp:
    line = fp.readline() # header
    header = line.split(",")
    
    indiv = ["IID", "FID"]
    icd10_colnames = [col.strip('"') for col in header if "f.41270." in col]
    icd10_ages = [col.strip('"') for col in header if "f.41280." in col]
    icd9_colnames = [col.strip('"') for col in header if "f.41271." in col]
    icd9_ages = [col.strip('"') for col in header if "f.41281." in col]
    f20002_colnames = [col.strip('"') for col in header if "f.20002." in col]
    tin_cols = [col.strip('"') for col in header if "f.4803." in col]
    ages_f21003_col = [col.strip('"') for col in header if "f.21003." in col]
    ages_f131258_col = [col.strip('"') for col in header if 'f.131258.' in col]
    year_of_birth = [col.strip('"') for col in header if "f.34." in col]
    month_of_birth = [col.strip('"') for col in header if "f.52." in col]

In [3]:
# combine the column names into one list
combined_cols = indiv + icd10_colnames + icd10_ages + icd9_colnames + icd9_ages + f20002_colnames + tin_cols + ages_f21003_col + ages_f131258_col + year_of_birth + month_of_birth

In [4]:
print(datetime.now())

2022-09-24 17:42:09.705098


In [4]:
# database of all individuals that we are working with and the selected phenotypes
df = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/092821_UKBB_486416ind_call90.csv", quotechar = '"', dtype="string", usecols=combined_cols)
df

Unnamed: 0,IID,FID,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,...,f.41281.0.8,f.41281.0.9,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0
0,1000019,1000019,1960,November,,,,,1111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1000022,1000022,1954,August,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1000035,1000035,1944,May,,,,,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1000046,1000046,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1000054,1000054,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486412,6025409,6025409,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486413,6025411,6025411,1960,November,"No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486414,6025425,6025425,1963,August,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [70]:
# Read the data used for hearing loss analysis
HL_pheno = pd.read_csv('~/project_bst/tinnitus/20220923_UKBB_HL_expandedwhite_178906cases_237318ctrl',sep='\t', dtype="string")
HL_pheno = HL_pheno[["IID","sex","hearing_imp_pure_ctrl"]]
HL_pheno

Unnamed: 0,IID,sex,hearing_imp_pure_ctrl
0,1000022,0,1
1,1000035,0,1
2,1000046,1,1
3,1000054,1,1
4,1000063,0,0
...,...,...,...
416219,6025390,1,1
416220,6025409,1,0
416221,6025411,1,0
416222,6025425,1,0


In [71]:
#subset the data only for individuals used in the hearing loss analysis
filtered = HL_pheno.merge(df, on = 'IID',how='left')

In [72]:
filtered

Unnamed: 0,IID,sex,hearing_imp_pure_ctrl,FID,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.8,f.41281.0.9,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0
0,1000022,0,1,1000022,1954,August,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1000035,0,1,1000035,1944,May,,,,,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1000046,1,1,1000046,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1000054,1,1,1000054,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1000063,0,0,1000063,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416219,6025390,1,1,6025390,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416220,6025409,1,0,6025409,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416221,6025411,1,0,6025411,1960,November,"No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416222,6025425,1,0,6025425,1963,August,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [77]:
filtered.to_csv('~/project_bst/tinnitus/092821_UKBB_486416ind_call90_filtered_tinnitus.csv',sep='\t', index=False)

## Remove inconsistencies or unclear individuals

###  Prior to filtering for inconsistencies

<b>Tinnitus</b> <br>
f.4803 = {'No, never': 76141,
 'Yes, but not now, but have in the past': 11400,
 'Yes, now some of the time': 9788,
 'Yes, now a lot of the time': 2973,
 'Yes, now most or all of the time': 7426,
 'Do not know': 1745,
 'Prefer not to answer': 127}

### Inconsistencies in the tinnitus answers

In [73]:
tin_cols = [col for col in filtered if "f.4803" in col]

icd_10_cols = [col for col in df if "f.41270" in col]
icd_9_cols = [col for col in df if "f.41271" in col]

In [74]:
tin_ans = {"Do not know":9, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}
options = set()
# pass one pheno at a time
def find_options(row):
    answer = "".join([str(tin_ans[i]) for i in row if not pd.isna(i) and i != "Prefer not to answer"])
    options.add(answer)

In [75]:
#this builds the options set to contain a set of all the unique answers the individuals in the database have had

tin_qs = filtered[tin_cols]
s = tin_qs.apply(find_options, axis=1)

In [76]:
options

{'',
 '0',
 '00',
 '000',
 '0000',
 '0001',
 '001',
 '0010',
 '0011',
 '009',
 '01',
 '010',
 '0100',
 '0101',
 '011',
 '0111',
 '019',
 '09',
 '090',
 '091',
 '099',
 '1',
 '10',
 '100',
 '1000',
 '101',
 '1010',
 '1011',
 '11',
 '110',
 '1100',
 '1101',
 '111',
 '1110',
 '1111',
 '119',
 '19',
 '190',
 '191',
 '1911',
 '9',
 '90',
 '900',
 '901',
 '9011',
 '909',
 '91',
 '911',
 '99',
 '990',
 '991'}

In [78]:
# we might have inconsistencies if we have don't knows mixed with other answers or even yes and no together
# options contains the set of all unique codes 

do_not_know_no = [i for i in options if '0' in i and '9' in i and '1' not in i]
do_not_know_yes = [i for i in options if '0' not in i and '9' in i and '1' in i]
yes_no = [i for i in options if '0' in i and '9' not in i and '1' in i]
with_all_three = [i for i in options if '0' in i and '9' in i and '1' in i]

# collecting all possible occurances that can be flagged for being inconsistent
might_inconsistent = do_not_know_no + do_not_know_yes + yes_no + with_all_three

# these are exceptions to the possibilities we have that can be flagged for being inconsistent
exceptions = ["91","911","9111","991","0001","001","0011","01","011","0111", "0091", "091","0911","0991","9001","901","9011", "90", "900", "9000", "990"]

# collect the list of answer combinations that are actually inconsistent by removing the answer combinations that are exceptions
inconsistent = [i for i in might_inconsistent if i not in exceptions]

In [79]:
inconsistent

['909',
 '009',
 '09',
 '099',
 '090',
 '191',
 '19',
 '1911',
 '119',
 '101',
 '1010',
 '010',
 '100',
 '0101',
 '0010',
 '1101',
 '1100',
 '1011',
 '1110',
 '1000',
 '110',
 '0100',
 '10',
 '019',
 '190']

####  Filtering out the data

In [80]:
filtered

Unnamed: 0,IID,sex,hearing_imp_pure_ctrl,FID,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.8,f.41281.0.9,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0
0,1000022,0,1,1000022,1954,August,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1000035,0,1,1000035,1944,May,,,,,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1000046,1,1,1000046,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1000054,1,1,1000054,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1000063,0,0,1000063,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416219,6025390,1,1,6025390,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416220,6025409,1,0,6025409,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416221,6025411,1,0,6025411,1960,November,"No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416222,6025425,1,0,6025425,1963,August,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [81]:
# will return true if that row should be removed
# pass one pheno at a time
def find_inconsistencies(row):
    for i in row:
        if not pd.isna(i) and i == "I am completely deaf":
            return True
    
    answer = "".join([str(tin_ans[i]) for i in row if not pd.isna(i) and i != "I am completely deaf" and i != "Prefer not to answer"])
    if answer in inconsistent:
        return True
    return False

In [82]:
exclude = filtered[tin_cols].apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

In [83]:
filtered

Unnamed: 0,IID,sex,hearing_imp_pure_ctrl,FID,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.8,f.41281.0.9,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0
0,1000022,0,1,1000022,1954,August,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1000035,0,1,1000035,1944,May,,,,,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1000046,1,1,1000046,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1000054,1,1,1000054,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1000063,0,0,1000063,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416219,6025390,1,1,6025390,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416220,6025409,1,0,6025409,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416221,6025411,1,0,6025411,1960,November,"No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416222,6025425,1,0,6025425,1963,August,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Identify Pure Control

Need to make sure that for tinnitus we obtain the individuals that say no.

The conditions for saying no are as follows:
* Never saying yes
* Not being inconsistent
* Saying no at least once (prefer not to say is allowed)

We are also not including individuals to be part of the control group if they have certain codes for ICD9, ICD10, or f.20002 (this is in the case that they say no to all tinnitus). However these individuals can still be part of the cases

In [84]:
tin_ans = {"Do not know":9, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

# returns 0 if it's a ctrl or else 1
def find_ctrl(row):
    answer = "".join([str(tin_ans[i]) for i in row if not pd.isna(i) and i != "Prefer not to answer"])
    if "0" in answer and "1" not in answer:
        return False
    return True

In [85]:
f4803_ctrl = filtered[tin_cols].apply(find_ctrl, axis=1)

In [86]:
sum(f4803_ctrl)

299451

# Filter out exclusions from the full database

If individuals have certain codes from ICD 10, ICD 9, and self-reports they must be fully removed from the analysis. 

In [93]:
# returns if the current individual should be excluded based on the exclusion list
def contains_exclusion(row, exclusion_list):
    for i in row:
        if not pd.isna(i) and i in exclusion_list:
            return True
            
    return False

In [88]:
# csv file that contains information on the exclusion criteria for cases and controls
exclusion = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/ICD10_9_selfreport_incl_excl.csv")
exclusion

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
0,f.41270,H60-H62 Diseases of external ear,,,,,,,,,
1,f.41270,H60 Otitis externa,,,,,,,,,
2,f.41270,H60.0 Abscess of external ear,32.0,N,N,,,,,,
3,f.41270,H60.1 Cellulitis of external ear,218.0,N,N,,,,,,
4,f.41270,H60.2 Malignant otitis externa,49.0,N,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
566,f.20002,1491 brain haemorrhage,218.0,Y,,,,,,,
567,f.20002,1583 ischaemic stroke,44.0,N,N,,,,,,
568,f.20002,1082 transient ischaemic attack (tia),2243.0,N,N,,,,,,
569,f.20002,1083 subdural haemorrhage/haematoma,212.0,Y,,,,,,,


### Collect ICD 10 codes to filter out from Ctrl

In [89]:
exclude_ctrl_icd10 = exclusion[(exclusion["UKBB_field_code"] == "f.41270") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd10

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
97,f.41270,H83.3 Noise effects on inner ear,24.0,N,Y,,,,,,
98,f.41270,H83.8 Other specified diseases of inner ear,51.0,N,Y,,,,,,
99,f.41270,"H83.9 Disease of inner ear, unspecified",33.0,N,Y,,,,,,
105,f.41270,"H90.3 Sensorineural hearing loss, bilateral",721.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,Individuals with this code were initially excl...
106,f.41270,"H90.4 Sensorineural hearing loss, unilateral w...",185.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,N,,
107,f.41270,"H90.5 Sensorineural hearing loss, unspecified",880.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,Individuals with this code were initially excl...
108,f.41270,H90.6 Mixed conductive and sensorineural heari...,133.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,Individuals with this code were initially excl...
109,f.41270,H90.7 Mixed conductive and sensorineural heari...,75.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,N,,
110,f.41270,H90.8 Mixed conductive and sensorineural heari...,115.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,Individuals with this code were initially excl...
113,f.41270,H91.1 Presbycusis,408.0,N,Y,,,,N,,


In [90]:
ex_critia_ctrl_icd10 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd10["Phenotype"].tolist()]
ex_critia_ctrl_icd10

['H833',
 'H838',
 'H839',
 'H903',
 'H904',
 'H905',
 'H906',
 'H907',
 'H908',
 'H911',
 'H912',
 'H913',
 'H918',
 'H919',
 'H930',
 'H931',
 'H932',
 'H933',
 'H938',
 'H939',
 'Z461',
 'Z974']

In [91]:
icd10 = filtered[icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,f.41270.0.10,f.41270.0.11,f.41270.0.12,f.41270.0.13,f.41270.0.14,f.41270.0.15,f.41270.0.16,f.41270.0.17,f.41270.0.18,f.41270.0.19,f.41270.0.20,f.41270.0.21,f.41270.0.22,f.41270.0.23,f.41270.0.24,f.41270.0.25,f.41270.0.26,f.41270.0.27,f.41270.0.28,f.41270.0.29,f.41270.0.30,f.41270.0.31,f.41270.0.32,f.41270.0.33,f.41270.0.34,f.41270.0.35,f.41270.0.36,f.41270.0.37,f.41270.0.38,f.41270.0.39,...,f.41270.0.186,f.41270.0.187,f.41270.0.188,f.41270.0.189,f.41270.0.190,f.41270.0.191,f.41270.0.192,f.41270.0.193,f.41270.0.194,f.41270.0.195,f.41270.0.196,f.41270.0.197,f.41270.0.198,f.41270.0.199,f.41270.0.200,f.41270.0.201,f.41270.0.202,f.41270.0.203,f.41270.0.204,f.41270.0.205,f.41270.0.206,f.41270.0.207,f.41270.0.208,f.41270.0.209,f.41270.0.210,f.41270.0.211,f.41270.0.212,f.41270.0.213,f.41270.0.214,f.41270.0.215,f.41270.0.216,f.41270.0.217,f.41270.0.218,f.41270.0.219,f.41270.0.220,f.41270.0.221,f.41270.0.222,f.41270.0.223,f.41270.0.224,f.41270.0.225
0,F101,J342,R619,S8280,W010,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,H269,K579,K590,K621,M5459,N40,R040,R31,R398,Z466,Z538,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,E780,G473,R065,R074,Z824,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,C679,C787,C795,C798,D090,I802,I959,J181,K922,M169,M4782,M5499,M7989,N133,N179,N200,N201,N209,N390,N820,R42,R798,S7200,T831,W010,Y831,Y95,Z089,Z510,Z511,Z513,Z530,Z855,Z871,Z907,Z936,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416219,A099,D649,E279,E538,E559,I10,I839,K449,K573,K649,M060,M069,M179,M199,M2550,M819,R104,R11,R13,R410,R509,R590,R619,R634,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416220,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416221,O149,O266,O342,O471,O48,O610,O680,Z370,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416222,G551,M501,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [94]:
ex_fxn_icd10 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd10)
ex_10 = icd10.apply(ex_fxn_icd10, axis=1)

### Collect ICD 9 codes to filter out from Ctrl

In [95]:
exclude_ctrl_icd9 = exclusion[(exclusion["UKBB_field_code"] == "f.41271") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd9

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
371,f.41271,3880 Degenerative and vascular disorders of ear,0.0,N,Y,,,,,,
372,f.41271,3881 Noise effects on inner ear,0.0,N,Y,,,,,,
373,f.41271,"3882 Sudden hearing loss, unspecified",0.0,N,Y,,,,,,
374,f.41271,3883 Tinnitus,11.0,N,Y,,,,,,
375,f.41271,3884 Other abnormal auditory perception,0.0,N,Y,,,,,,
379,f.41271,3888 Other specified disorders of ear,1.0,N,Y,,,,,,
380,f.41271,"3889 Disorders of ear, unspecified",2.0,N,Y,,,,,,
383,f.41271,3891 Sensorineural deafness,6.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,
384,f.41271,3892 Mixed conductive and sensorineural deafness,1.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,
385,f.41271,"3897 Deaf mutism, not elsewhere classifiable",1.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,


In [96]:
ex_critia_ctrl_icd9 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd9["Phenotype"].tolist()]
ex_critia_ctrl_icd9

['3880',
 '3881',
 '3882',
 '3883',
 '3884',
 '3888',
 '3889',
 '3891',
 '3892',
 '3897',
 '3898',
 '3899',
 'V412',
 'V532']

In [97]:
icd9 = filtered[icd9_colnames]
icd9

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,f.41271.0.10,f.41271.0.11,f.41271.0.12,f.41271.0.13,f.41271.0.14,f.41271.0.15,f.41271.0.16,f.41271.0.17,f.41271.0.18,f.41271.0.19,f.41271.0.20,f.41271.0.21,f.41271.0.22,f.41271.0.23,f.41271.0.24,f.41271.0.25,f.41271.0.26,f.41271.0.27,f.41271.0.28,f.41271.0.29,f.41271.0.30,f.41271.0.31,f.41271.0.32,f.41271.0.33,f.41271.0.34,f.41271.0.35,f.41271.0.36,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3000,5198,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416219,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416220,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416221,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416222,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [98]:
ex_fxn_icd9 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd9)
ex_9 = icd9.apply(ex_fxn_icd9, axis=1)

### Collect f20002 codes to filter out from Ctrl

In [103]:
# these are the columns that represent the self-report columns in the working database
f20002_colnames = [col for col in filtered if "f.20002" in col]

In [104]:
# get a dataframe that only contains the self-report columns from the working database
f20002 = filtered[f20002_colnames]
f20002

Unnamed: 0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,f.20002.1.1,f.20002.1.2,f.20002.1.3,f.20002.1.4,f.20002.1.5,...,f.20002.2.28,f.20002.2.29,f.20002.2.30,f.20002.2.31,f.20002.2.32,f.20002.2.33,f.20002.3.0,f.20002.3.1,f.20002.3.2,f.20002.3.3,f.20002.3.4,f.20002.3.5,f.20002.3.6,f.20002.3.7,f.20002.3.8,f.20002.3.9,f.20002.3.10,f.20002.3.11,f.20002.3.12,f.20002.3.13,f.20002.3.14,f.20002.3.15,f.20002.3.16,f.20002.3.17,f.20002.3.18,f.20002.3.19,f.20002.3.20,f.20002.3.21,f.20002.3.22,f.20002.3.23,f.20002.3.24,f.20002.3.25,f.20002.3.26,f.20002.3.27,f.20002.3.28,f.20002.3.29,f.20002.3.30,f.20002.3.31,f.20002.3.32,f.20002.3.33
0,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416219,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416220,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1627,1478,1473,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416221,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
416222,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [105]:
exclude_ctrl_f20002 = exclusion[(exclusion["UKBB_field_code"] == "f.20002") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_f20002

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
543,f.20002,1597 tinnitus / tiniitis,1950.0,N,Y,,,,,,


In [106]:
ex_critia_ctrl_f20002 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_f20002["Phenotype"].tolist()]
ex_critia_ctrl_f20002

['1597']

In [107]:
ex_fxn_f20002 = lambda row: contains_exclusion(row, ex_critia_ctrl_f20002)
ex_f20002 = f20002.apply(ex_fxn_f20002, axis=1)

### Collect individuals with other tinnitus codes to filter out from CTRL

In [108]:
# check if the given code exists in the individuals
def check_code(row, code):
    for i in row:
        if not pd.isna(i) and i == code:
            return 1
    return 0

In [109]:
tinn_icd10_check_code = lambda row: check_code(row, "H931")
tinn_icd10 = filtered[icd_10_cols].apply(tinn_icd10_check_code, axis = 1)

In [110]:
tinn_icd9_check_code = lambda row: check_code(row, "3883")
tinn_icd9 = filtered[icd_9_cols].apply(tinn_icd9_check_code, axis = 1)

In [111]:
self_report_cols = [col for col in filtered if "f.20002" in col]
tinn_self_report_check_code = lambda row: check_code(row, "1597")
tinn_self_report = filtered[self_report_cols].apply(tinn_self_report_check_code, axis = 1)

### Filter out Tinnitus Ctrl

In [112]:
sum(f4803_ctrl | ex_10 | ex_9 | ex_f20002 | tinn_icd10 | tinn_icd9 | tinn_self_report)

300754

In [113]:
temp = f4803_ctrl | ex_10 | ex_9 | ex_f20002 | tinn_icd10 | tinn_icd9 | tinn_self_report
filtered_ctrl = [1 if i else 0 for i in temp.to_list()]

In [114]:
filtered["tinnitus_pure_ctrl"] = filtered_ctrl

In [115]:
filtered

Unnamed: 0,IID,sex,hearing_imp_pure_ctrl,FID,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.9,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,tinnitus_pure_ctrl
0,1000022,0,1,1000022,1954,August,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1
1,1000035,0,1,1000035,1944,May,,,,,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1
2,1000046,1,1,1000046,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
3,1000054,1,1,1000054,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1
4,1000063,0,0,1000063,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416219,6025390,1,1,6025390,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
416220,6025409,1,0,6025409,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
416221,6025411,1,0,6025411,1960,November,"No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
416222,6025425,1,0,6025425,1963,August,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1


In [117]:
filtered["tinnitus_pure_ctrl"].value_counts()

1    300754
0    114032
Name: tinnitus_pure_ctrl, dtype: int64

## Identify Age

In [118]:
ages_f21003_col = [col for col in filtered if "f.21003" in col]
ages_f21003_col

['f.21003.0.0', 'f.21003.1.0', 'f.21003.2.0', 'f.21003.3.0']

In [119]:
#get the minimum age of each individual in the given columns
def get_min_age(row):
    temp = [i for i in row.to_list() if not pd.isna(i)]
    if len(temp) > 0:
        return min(temp)
    else:
        return pd.NA

In [120]:
filtered["tinnitus_age"] = filtered[ages_f21003_col].apply(get_min_age, axis=1)

In [121]:
saved_tinn = filtered

In [122]:
filtered = saved_tinn

## Noisy workplace and Loud Music Variable

Two variables that we need to control for in the analysis are f.4825 (noisy workplace) and f.4836 (loud music).

### Check for inconsistencies

<b>f.4825 "Have you ever worked in a noisy place where you had to shout to be heard?"</b> <br>
"No"= "0", <br>
"Yes, for less than a year"= "1", <br>
"Yes, for around 1-5 years"= "2", <br>
"Yes, for more than 5 years"= "3", <br>
"Prefer not to answer"= NA, <br>
"Do not know"= NA <br>

<b>f.4836 "Have you ever listened to music for more than 3 hours per week at a volume which you would need to shout to be heard or, if wearing headphones, someone else would need to shout for you to hear them?" </b> <br>
"No"= "0", <br>
"Yes, for less than a year"= "1", <br>
"Yes, for around 1-5 years"= "2", <br>
"Yes, for more than 5 years"= "3", <br>
"Prefer not to answer"= NA, <br>
"Do not know"= NA <br>

In [124]:
noise_wp_cols = [col for col in df if "f.4825" in col]
loud_music_cols = [col for col in df if "f.4836" in col]

In [125]:
noise_loud_answers = {"No":0, "Yes, for less than a year":1, "Yes, for around 1-5 years":2, "Yes, for more than 5 years":3}

# if the answers we have are not sorted in the order that they're in the list, then that individual is inconsistent
def find_inconsistencies_noisy_loud(row):
    ans = [noise_loud_answers[i] for i in row if i in noise_loud_answers.keys()] 
    return sorted(ans) != ans
        

In [126]:
exclude = filtered[noise_wp_cols].apply(find_inconsistencies_noisy_loud, axis=1)
filtered = filtered[~exclude]

In [127]:
exclude = filtered[loud_music_cols].apply(find_inconsistencies_noisy_loud, axis=1)
filtered = filtered[~exclude]

### Label Noise and Loud Music

In [128]:
# find individuals that have yes for either noise or loud sounds
def find_label_noisy_loud(row):
    ans = [noise_loud_answers[i] for i in row if i in noise_loud_answers.keys()]
    if len(ans) > 0:
        return ans[-1]
    return pd.NA

In [129]:
filtered["noise_wp"] = filtered[noise_wp_cols].apply(find_label_noisy_loud, axis=1)

In [130]:
filtered["loud_music"] = filtered[loud_music_cols].apply(find_label_noisy_loud, axis=1)

In [133]:
filtered

Unnamed: 0,IID,sex,hearing_imp_pure_ctrl,FID,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,tinnitus_pure_ctrl,tinnitus_age,noise_wp,loud_music
0,1000022,0,1,1000022,1954,August,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,53,,
1,1000035,0,1,1000035,1944,May,,,,,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,63,,
2,1000046,1,1,1000046,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,62,,
3,1000054,1,1,1000054,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,65,,
4,1000063,0,0,1000063,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,43,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416219,6025390,1,1,6025390,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,67,,
416220,6025409,1,0,6025409,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,61,,
416221,6025411,1,0,6025411,1960,November,"No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,49,,
416222,6025425,1,0,6025425,1963,August,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,44,,


## Identify Cases

**Analysis plan:**

1. Individuals who currently have tinnitus (all four yes categories) vs no never (No tinnitus vs anytype of tinnitus). 

2. Individuals in the three top YES categories vs NO never ('No' tinnitus vs 'Yes, now most of the time or all of the time', 'Yes, now a lot of the time' and 'Yes, now some of the time')

3. Individuals that say yes in the top two categories vs No never ('No' tinnitus vs 'Yes, now most of the time or all of the time' and  'Yes, now a lot of the time')


In [134]:
# return 1 if we have a yes (used to find phenos)
def find_yes(row):
    for i in row:
        if not pd.isna(i) and i != "Prefer not to answer" and i in tin_ans.keys() and tin_ans[i] == 1:
            return 1
    return 0

### Case group 1

In [135]:
tin_ans = {"Do not know":9, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [136]:
tinn_yes = filtered[tin_cols].apply(find_yes, axis=1)

In [137]:
filtered["tinnitus_1"] = tinn_yes

### Case group 2

In [138]:
tin_ans = {"Do not know":9, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [139]:
tinn_yes = filtered[tin_cols].apply(find_yes, axis=1)

In [140]:
filtered["tinnitus_2"] = tinn_yes

### Case group 3

In [141]:
tin_ans = {"Do not know":9, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [142]:
tinn_yes = filtered[tin_cols].apply(find_yes, axis=1)

In [143]:
filtered["tinnitus_3"] = tinn_yes

In [144]:
filtered

Unnamed: 0,IID,sex,hearing_imp_pure_ctrl,FID,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,tinnitus_pure_ctrl,tinnitus_age,noise_wp,loud_music,tinnitus_1,tinnitus_2,tinnitus_3
0,1000022,0,1,1000022,1954,August,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,53,,,0,0,0
1,1000035,0,1,1000035,1944,May,,,,,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,63,,,0,0,0
2,1000046,1,1,1000046,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,62,,,0,0,0
3,1000054,1,1,1000054,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,65,,,0,0,0
4,1000063,0,0,1000063,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,43,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416219,6025390,1,1,6025390,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,67,,,0,0,0
416220,6025409,1,0,6025409,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,61,,,0,0,0
416221,6025411,1,0,6025411,1960,November,"No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,49,,,0,0,0
416222,6025425,1,0,6025425,1963,August,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,44,,,0,0,0


In [145]:
sum(filtered["tinnitus_1"])

48513

In [146]:
sum(filtered["tinnitus_2"])

31437

In [147]:
sum(filtered["tinnitus_3"])

16441

## File Output

In [149]:
filtered[["FID", "IID", "sex", "tinnitus_age","hearing_imp_pure_ctrl","tinnitus_pure_ctrl", "tinnitus_1", "tinnitus_2","tinnitus_3"]].to_csv("~/project_bst/tinnitus/tinnitus_pheno_file.tsv", sep='\t', index=False)

In [231]:
tinnitus_pheno = filtered[["FID", "IID", "sex", "tinnitus_age","hearing_imp_pure_ctrl","tinnitus_pure_ctrl", "tinnitus_1", "tinnitus_2","tinnitus_3"]]

In [None]:
# control definition need to be recode : 0 pure controls else 1

In [232]:
tinnitus_pheno["tinnitus_pure_ctrl"] = tinnitus_pheno["tinnitus_pure_ctrl"].apply(lambda x: 1 if x == 0 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [235]:
tinnitus_pheno_all_HL_samples = tinnitus_pheno

In [236]:
tinnitus_pheno_only_HL_controls = tinnitus_pheno[tinnitus_pheno["hearing_imp_pure_ctrl"] == "0"][["FID", "IID", "sex", "tinnitus_age","tinnitus_pure_ctrl", "tinnitus_1", "tinnitus_2","tinnitus_3"]]

In [237]:
tinnitus_pheno_only_HL_controls

Unnamed: 0,FID,IID,sex,tinnitus_age,tinnitus_pure_ctrl,tinnitus_1,tinnitus_2,tinnitus_3
4,1000063,1000063,0,43,1,0,0,0
5,1000078,1000078,1,52,1,0,0,0
6,1000081,1000081,0,67,1,0,0,0
10,1000129,1000129,0,62,0,0,0,0
11,1000137,1000137,1,46,0,0,0,0
...,...,...,...,...,...,...,...,...
416217,6025363,6025363,0,64,0,0,0,0
416220,6025409,6025409,1,61,1,0,0,0
416221,6025411,6025411,1,49,1,0,0,0
416222,6025425,6025425,1,44,0,0,0,0


In [238]:
tinnitus_pheno

Unnamed: 0,FID,IID,sex,tinnitus_age,hearing_imp_pure_ctrl,tinnitus_pure_ctrl,tinnitus_1,tinnitus_2,tinnitus_3
0,1000022,1000022,0,53,1,0,0,0,0
1,1000035,1000035,0,63,1,0,0,0,0
2,1000046,1000046,1,62,1,1,0,0,0
3,1000054,1000054,1,65,1,0,0,0,0
4,1000063,1000063,0,43,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
416219,6025390,6025390,1,67,1,1,0,0,0
416220,6025409,6025409,1,61,0,1,0,0,0
416221,6025411,6025411,1,49,0,1,0,0,0
416222,6025425,6025425,1,44,0,0,0,0,0


In [239]:
tinnitus_pheno[["tinnitus_pure_ctrl","tinnitus_1","tinnitus_2","tinnitus_3"]].value_counts()

tinnitus_pure_ctrl  tinnitus_1  tinnitus_2  tinnitus_3
0                   0           0           0             252241
1                   0           0           0             114032
0                   1           0           0              17076
                                1           1              16441
                                            0              14996
dtype: int64

In [240]:
tinnitus_pheno[["tinnitus_pure_ctrl"]].value_counts()

tinnitus_pure_ctrl
0                     300754
1                     114032
dtype: int64

In [241]:
tinnitus_pheno[["tinnitus_1"]].value_counts()

tinnitus_1
0             366273
1              48513
dtype: int64

In [242]:
tinnitus_pheno[["tinnitus_2"]].value_counts()

tinnitus_2
0             383349
1              31437
dtype: int64

In [243]:
tinnitus_pheno[["tinnitus_3"]].value_counts()

tinnitus_3
0             398345
1              16441
dtype: int64

In [244]:
tinnitus_pheno_only_HL_controls[["tinnitus_pure_ctrl","tinnitus_1","tinnitus_2","tinnitus_3"]].value_counts()

tinnitus_pure_ctrl  tinnitus_1  tinnitus_2  tinnitus_3
0                   0           0           0             145939
1                   0           0           0              73509
0                   1           0           0               7992
                                1           0               5696
                                            1               3511
dtype: int64

In [245]:
tinnitus_pheno_only_HL_controls[["tinnitus_pure_ctrl"]].value_counts()

tinnitus_pure_ctrl
0                     163138
1                      73509
dtype: int64

In [246]:
tinnitus_pheno_only_HL_controls[["tinnitus_1"]].value_counts()

tinnitus_1
0             219448
1              17199
dtype: int64

In [247]:
tinnitus_pheno_only_HL_controls[["tinnitus_2"]].value_counts()

tinnitus_2
0             227440
1               9207
dtype: int64

In [248]:
tinnitus_pheno_only_HL_controls[["tinnitus_3"]].value_counts()

tinnitus_3
0             233136
1               3511
dtype: int64

In [None]:
#Save output for all samples

In [249]:
tinnitus_pheno.to_csv("~/project_bst/tinnitus/tinnitus_pheno_all_HL_samples_20220925.tsv", sep='\t', index=False)

In [269]:
tinnitus_pheno[tinnitus_pheno["tinnitus_pure_ctrl"] == 0][["FID", "IID", "sex", "tinnitus_age","hearing_imp_pure_ctrl","tinnitus_pure_ctrl"]].to_csv("tinnitus_pure_ctrl_pheno.tsv", sep='\t', index=False)
tinnitus_pheno[tinnitus_pheno["tinnitus_1"] == 1][["FID", "IID", "sex", "tinnitus_age","hearing_imp_pure_ctrl","tinnitus_1"]].to_csv("tinnitus_group1_pheno.tsv", sep='\t', index=False)
tinnitus_pheno[tinnitus_pheno["tinnitus_2"] == 1][["FID", "IID", "sex", "tinnitus_age","hearing_imp_pure_ctrl","tinnitus_2"]].to_csv("tinnitus_group2_pheno.tsv", sep='\t', index=False)
tinnitus_pheno[tinnitus_pheno["tinnitus_3"] == 1][["FID", "IID", "sex", "tinnitus_age","hearing_imp_pure_ctrl","tinnitus_3"]].to_csv("tinnitus_group3_pheno.tsv", sep='\t', index=False)

In [251]:
#Save output for only controls of the HL analysis

In [252]:
tinnitus_pheno_only_HL_controls.to_csv("~/project_bst/tinnitus/tinnitus_pheno_only_HL_controls_20220925.tsv", sep='\t', index=False)

In [271]:
tinnitus_pheno_only_HL_controls[tinnitus_pheno_only_HL_controls["tinnitus_pure_ctrl"] == 0][["FID", "IID", "sex", "tinnitus_age","tinnitus_pure_ctrl"]].to_csv("tinnitus_pure_ctrl_pheno_controls.tsv", sep='\t', index=False)
tinnitus_pheno_only_HL_controls[tinnitus_pheno_only_HL_controls["tinnitus_1"] == 1][["FID", "IID", "sex", "tinnitus_age","tinnitus_1"]].to_csv("tinnitus_group1_pheno_controls.tsv", sep='\t', index=False)
tinnitus_pheno_only_HL_controls[tinnitus_pheno_only_HL_controls["tinnitus_2"] == 1][["FID", "IID", "sex", "tinnitus_age","tinnitus_2"]].to_csv("tinnitus_group2_pheno_controls.tsv", sep='\t', index=False)
tinnitus_pheno_only_HL_controls[tinnitus_pheno_only_HL_controls["tinnitus_3"] == 1][["FID", "IID", "sex", "tinnitus_age","tinnitus_3"]].to_csv("tinnitus_group3_pheno_controls.tsv", sep='\t', index=False)

## Merge Pheno with Ctrl

### All samples

In [254]:
tinnitus_ctrl_file_name = "tinnitus_pure_ctrl_pheno.tsv"
tinnitus_1_file_name = "tinnitus_group1_pheno.tsv"
tinnitus_2_file_name = "tinnitus_group2_pheno.tsv"
tinnitus_3_file_name = "tinnitus_group3_pheno.tsv"

In [255]:
tinnitus_ctrl = pd.read_csv(tinnitus_ctrl_file_name, sep="\t")
tinnitus_group1 = pd.read_csv(tinnitus_1_file_name, sep="\t")
tinnitus_group2 = pd.read_csv(tinnitus_2_file_name, sep="\t")
tinnitus_group3 = pd.read_csv(tinnitus_3_file_name, sep="\t")

In [256]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[3]:"age"})
tinnitus_group1 = tinnitus_group1.rename(columns={tinnitus_group1.columns[3]:"age"})
tinnitus_group2 = tinnitus_group2.rename(columns={tinnitus_group2.columns[3]:"age"})
tinnitus_group3 = tinnitus_group3.rename(columns={tinnitus_group3.columns[3]:"age"})

In [257]:
tinnitus_ctrl

Unnamed: 0,FID,IID,sex,age,hearing_imp_pure_ctrl,tinnitus_pure_ctrl
0,1000022,1000022,0,53,1,0
1,1000035,1000035,0,63,1,0
2,1000054,1000054,1,65,1,0
3,1000105,1000105,1,54,1,0
4,1000129,1000129,0,62,0,0
...,...,...,...,...,...,...
300749,6025335,6025335,0,62,0,0
300750,6025346,6025346,1,53,0,0
300751,6025363,6025363,0,64,0,0
300752,6025425,6025425,1,44,0,0


In [258]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[5]:tinnitus_group1.columns[5]})
full_tinnitus_group1 = tinnitus_group1.append(tinnitus_ctrl)
full_tinnitus_group1



Unnamed: 0,FID,IID,sex,age,hearing_imp_pure_ctrl,tinnitus_1
0,1000105,1000105,1,54,1,1
1,1000331,1000331,1,53,0,1
2,1000439,1000439,1,59,0,1
3,1000494,1000494,0,61,1,1
4,1000728,1000728,0,61,1,1
...,...,...,...,...,...,...
300749,6025335,6025335,0,62,0,0
300750,6025346,6025346,1,53,0,0
300751,6025363,6025363,0,64,0,0
300752,6025425,6025425,1,44,0,0


In [259]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[5]:tinnitus_group2.columns[5]})
full_tinnitus_group2 = tinnitus_group2.append(tinnitus_ctrl)
full_tinnitus_group2



Unnamed: 0,FID,IID,sex,age,hearing_imp_pure_ctrl,tinnitus_2
0,1000439,1000439,1,59,0,1
1,1000728,1000728,0,61,1,1
2,1001045,1001045,0,61,1,1
3,1001052,1001052,1,64,1,1
4,1001067,1001067,0,50,1,1
...,...,...,...,...,...,...
300749,6025335,6025335,0,62,0,0
300750,6025346,6025346,1,53,0,0
300751,6025363,6025363,0,64,0,0
300752,6025425,6025425,1,44,0,0


In [260]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[5]:tinnitus_group3.columns[5]})
full_tinnitus_group3 = tinnitus_group3.append(tinnitus_ctrl)
full_tinnitus_group3



Unnamed: 0,FID,IID,sex,age,hearing_imp_pure_ctrl,tinnitus_3
0,1000728,1000728,0,61,1,1
1,1001123,1001123,1,62,1,1
2,1001162,1001162,1,67,0,1
3,1001929,1001929,0,51,1,1
4,1002859,1002859,0,61,0,1
...,...,...,...,...,...,...
300749,6025335,6025335,0,62,0,0
300750,6025346,6025346,1,53,0,0
300751,6025363,6025363,0,64,0,0
300752,6025425,6025425,1,44,0,0


In [261]:
full_tinnitus_group1[["tinnitus_1"]].value_counts()

tinnitus_1
0             300754
1              48513
dtype: int64

In [262]:
full_tinnitus_group2[["tinnitus_2"]].value_counts()

tinnitus_2
0             300754
1              31437
dtype: int64

In [263]:
full_tinnitus_group3[["tinnitus_3"]].value_counts()

tinnitus_3
0             300754
1              16441
dtype: int64

In [264]:
full_tinnitus_group1.to_csv("~/project_bst/tinnitus/tinnitus_case_group1_all_samples_48513cases_300754controls.tsv", sep='\t', index=False)
full_tinnitus_group2.to_csv("~/project_bst/tinnitus/tinnitus_case_group2_all_samples_31437cases_300754controls.tsv", sep='\t', index=False)
full_tinnitus_group3.to_csv("~/project_bst/tinnitus/tinnitus_case_group3_all_samples_16441cases_300754controls.tsv", sep='\t', index=False)

### Only controls

In [272]:
tinnitus_ctrl_file_name = "tinnitus_pure_ctrl_pheno_controls.tsv"
tinnitus_1_file_name = "tinnitus_group1_pheno_controls.tsv"
tinnitus_2_file_name = "tinnitus_group2_pheno_controls.tsv"
tinnitus_3_file_name = "tinnitus_group3_pheno_controls.tsv"

In [273]:
tinnitus_ctrl = pd.read_csv(tinnitus_ctrl_file_name, sep="\t")
tinnitus_group1 = pd.read_csv(tinnitus_1_file_name, sep="\t")
tinnitus_group2 = pd.read_csv(tinnitus_2_file_name, sep="\t")
tinnitus_group3 = pd.read_csv(tinnitus_3_file_name, sep="\t")

In [276]:
tinnitus_ctrl

Unnamed: 0,FID,IID,sex,age,tinnitus_pure_ctrl
0,1000129,1000129,0,62,0
1,1000137,1000137,1,46,0
2,1000186,1000186,0,53,0
3,1000224,1000224,1,58,0
4,1000287,1000287,1,60,0
...,...,...,...,...,...
163133,6025335,6025335,0,62,0
163134,6025346,6025346,1,53,0
163135,6025363,6025363,0,64,0
163136,6025425,6025425,1,44,0


In [278]:
tinnitus_group1

Unnamed: 0,FID,IID,sex,age,tinnitus_1
0,1000331,1000331,1,53,1
1,1000439,1000439,1,59,1
2,1000992,1000992,0,48,1
3,1001162,1001162,1,67,1
4,1001179,1001179,1,66,1
...,...,...,...,...,...
17194,6023379,6023379,1,52,1
17195,6024403,6024403,0,51,1
17196,6024812,6024812,0,56,1
17197,6024911,6024911,0,67,1


In [275]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[3]:"age"})
tinnitus_group1 = tinnitus_group1.rename(columns={tinnitus_group1.columns[3]:"age"})
tinnitus_group2 = tinnitus_group2.rename(columns={tinnitus_group2.columns[3]:"age"})
tinnitus_group3 = tinnitus_group3.rename(columns={tinnitus_group3.columns[3]:"age"})

In [279]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[4]:tinnitus_group1.columns[4]})
full_tinnitus_group1 = tinnitus_group1.append(tinnitus_ctrl)
full_tinnitus_group1



Unnamed: 0,FID,IID,sex,age,tinnitus_1
0,1000331,1000331,1,53,1
1,1000439,1000439,1,59,1
2,1000992,1000992,0,48,1
3,1001162,1001162,1,67,1
4,1001179,1001179,1,66,1
...,...,...,...,...,...
163133,6025335,6025335,0,62,0
163134,6025346,6025346,1,53,0
163135,6025363,6025363,0,64,0
163136,6025425,6025425,1,44,0


In [280]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[4]:tinnitus_group2.columns[4]})
full_tinnitus_group2 = tinnitus_group2.append(tinnitus_ctrl)
full_tinnitus_group2



Unnamed: 0,FID,IID,sex,age,tinnitus_2
0,1000439,1000439,1,59,1
1,1001162,1001162,1,67,1
2,1001395,1001395,1,59,1
3,1001566,1001566,0,60,1
4,1001744,1001744,1,45,1
...,...,...,...,...,...
163133,6025335,6025335,0,62,0
163134,6025346,6025346,1,53,0
163135,6025363,6025363,0,64,0
163136,6025425,6025425,1,44,0


In [281]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[4]:tinnitus_group3.columns[4]})
full_tinnitus_group3 = tinnitus_group3.append(tinnitus_ctrl)
full_tinnitus_group3



Unnamed: 0,FID,IID,sex,age,tinnitus_3
0,1001162,1001162,1,67,1
1,1002859,1002859,0,61,1
2,1002926,1002926,0,69,1
3,1003431,1003431,1,61,1
4,1003485,1003485,1,54,1
...,...,...,...,...,...
163133,6025335,6025335,0,62,0
163134,6025346,6025346,1,53,0
163135,6025363,6025363,0,64,0
163136,6025425,6025425,1,44,0


In [282]:
full_tinnitus_group1[["tinnitus_1"]].value_counts()

tinnitus_1
0             163138
1              17199
dtype: int64

In [284]:
full_tinnitus_group2[["tinnitus_2"]].value_counts()

tinnitus_2
0             163138
1               9207
dtype: int64

In [285]:
full_tinnitus_group3[["tinnitus_3"]].value_counts()

tinnitus_3
0             163138
1               3511
dtype: int64

In [286]:
full_tinnitus_group1.to_csv("~/project_bst/tinnitus/tinnitus_case_group1_only_HLcontrols_17199cases_163138controls.tsv", sep='\t', index=False)
full_tinnitus_group2.to_csv("~/project_bst/tinnitus/tinnitus_case_group2_all_only_HLcontrols_9207cases_163138controls.tsv", sep='\t', index=False)
full_tinnitus_group3.to_csv("~/project_bst/tinnitus/tinnitus_case_group3_all_only_HLcontrols_3511cases_163138controls.tsv", sep='\t', index=False)

# Exclude subjects with vestibular disorder code 1415

In [None]:
## Read pheno files for HL from the latest analysis by Diana after excluding samples with code 1415 but without the sub-category code

In [None]:
exclusion = pd.read_csv("/home/gl2776/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/pleiotropy_AD_ARHI/with_1415_without_subcat.500k.sample_id.txt",header=0,sep="\t")
exclusion = exclusion["IID"].to_list()
len(exclusion)

4226