
# Define Tinnitus

In this notebook, we generate a phenotype definition for tinnitus (Data-Field 4803, tinnitus) in the UKB, based on the subset of white European subjects used in the analysis of age-related hearing loss. The goal is to use the imputed and genotype data for the association test, with age and sex as covariates together with two principal components (PCs), which will need to be recalculated for the individuals used in this analysis. Please try to use the lowest allele frequency possible in the analysis.

## tinnitus questionnaire in UKB
ACE touchscreen question "Do you get or have you had noises (such as ringing or buzzing) in your head or in one or both ears that lasts for more than five minutes at a time?"

Possible answers were:
 1) Yes, now most or all of the time;
 2) Yes, now a lot of the time;
 3) Yes, now some of the time;
 4) Yes, but not now, but have in the past;
 5) No, never;
 6) Do not know; and
 7) Prefer not to answer.
 
## Cases and controls are defined based on two scenarios

### Scenario 1: Use entire (cases and controls) sample used for the Age related hearing loss (ARHL) analysis
Use the entire sample of controls and cases of white Europeans which we analyzed for last publication and from this group of individuals only select those individuals who answered No - never had tinnitus at all assessments.
### Scenario 2: use only controls used for the Age related hearing loss (ARHL)  analysis

# Read the data 

## Read the data in the database

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Read only the header row of the phenotype CSV and pick out, by UKB field id,
# the column names needed for this analysis.
with open("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/092821_UKBB_486416ind_call90.csv") as fp:
    line = fp.readline() # header
    # Drop the line terminator before splitting; otherwise the last column
    # name keeps a trailing '"\n' that strip('"') cannot remove, and that
    # column could never be selected by its real name.
    header = line.rstrip("\r\n").split(",")
    
    # individual / family identifiers
    indiv = ["IID", "FID"]
    # f.41270 / f.41271 are the ICD10 / ICD9 fields used for the exclusions later on
    icd10_colnames = [col.strip('"') for col in header if "f.41270." in col]
    icd10_ages = [col.strip('"') for col in header if "f.41280." in col]
    icd9_colnames = [col.strip('"') for col in header if "f.41271." in col]
    icd9_ages = [col.strip('"') for col in header if "f.41281." in col]
    # f.20002: self-reported conditions
    f20002_colnames = [col.strip('"') for col in header if "f.20002." in col]
    reported_sex = ["f.31.0.0"]
    genetic_sex = ["f.22001.0.0"]
    ethnicity = [col.strip('"') for col in header if "f.21000." in col]
    # f.4803: tinnitus question
    tin_cols = [col.strip('"') for col in header if "f.4803." in col]
    ages_f21003_col = [col.strip('"') for col in header if "f.21003." in col]
    ages_f131258_col = [col.strip('"') for col in header if 'f.131258.' in col]
    year_of_birth = [col.strip('"') for col in header if "f.34." in col]
    month_of_birth = [col.strip('"') for col in header if "f.52." in col]

In [15]:
# flatten all the per-field column-name lists into the single list of columns to load
combined_cols = [
    *indiv,
    *icd10_colnames,
    *icd10_ages,
    *icd9_colnames,
    *icd9_ages,
    *f20002_colnames,
    *ethnicity,
    *reported_sex,
    *genetic_sex,
    *tin_cols,
    *ages_f21003_col,
    *ages_f131258_col,
    *year_of_birth,
    *month_of_birth,
]

In [16]:
print(datetime.now())

2022-10-05 18:44:06.901849


In [17]:
# database of all individuals that we are working with and the selected phenotypes
# Only the columns listed in combined_cols are loaded, and every column is read
# with the pandas "string" dtype (so IDs and codes stay text and missing cells become <NA>).
# NOTE(review): this path (/mnt/mfs/statgen/...) differs from the one the header
# row was read from earlier (/mnt/vast/hpc/csg/...) - confirm both point to the same file.
df = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/092821_UKBB_486416ind_call90.csv", quotechar = '"', dtype="string", usecols=combined_cols)
df

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,...,f.41281.0.8,f.41281.0.9,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0
0,1000019,1000019,Female,1960,November,,,,,1111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1000022,1000022,Male,1954,August,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1000035,1000035,Male,1944,May,,,,,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1000046,1000046,Female,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1000054,1000054,Female,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486412,6025409,6025409,Female,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486413,6025411,6025411,Female,1960,November,"No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486414,6025425,6025425,Female,1963,August,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Read in individuals in genotype array QC

In [7]:
qc_individuals = pd.read_csv("/mnt/mfs/statgen/UKBiobank/results/092821_PCA_results_500K/white_europeans/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.white_europeans.filtered.fam", sep="\t", header=None)
qc_individuals

Unnamed: 0,0,1,2,3,4,5
0,1000019,1000019,0,0,2,-9
1,1000022,1000022,0,0,1,-9
2,1000035,1000035,0,0,1,-9
3,1000046,1000046,0,0,2,-9
4,1000054,1000054,0,0,2,-9
...,...,...,...,...,...,...
460644,6025390,6025390,0,0,2,-9
460645,6025409,6025409,0,0,2,-9
460646,6025411,6025411,0,0,2,-9
460647,6025425,6025425,0,0,2,-9


## Read in PCA outlier file

In [8]:
# outlier individuals that will need to be removed
outlier = pd.read_csv("/mnt/mfs/statgen/UKBiobank/results/092821_PCA_results_500K/092821_PCA_related_pval0.005/ukb47922_white_460649ind.092821_PCA_related_pval0.005.pca.projected.outliers", sep="\t", header=None)
outlier

Unnamed: 0,0,1
0,1003423,1003423
1,1008606,1008606
2,1009852,1009852
3,1010412,1010412
4,1010678,1010678
...,...,...
1377,5801962,5801962
1378,5807807,5807807
1379,5809112,5809112
1380,5833189,5833189


## Exclusion based on ICD9/10 (f.41271 and f.41270) and self_report tinnitus (f.20002)

In [5]:
filtered = pd.read_csv('UKB_whites_tinnitus_filtered.csv', sep='\t',dtype = 'string')

In [4]:
#csv file that contains information on the exclusion criteria for cases and controls
exclusion = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/ICD10_9_selfreport_incl_excl.csv")
exclusion

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
0,f.41270,H60-H62 Diseases of external ear,,,,,,,,,
1,f.41270,H60 Otitis externa,,,,,,,,,
2,f.41270,H60.0 Abscess of external ear,32.0,N,N,,,,,,
3,f.41270,H60.1 Cellulitis of external ear,218.0,N,N,,,,,,
4,f.41270,H60.2 Malignant otitis externa,49.0,N,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
566,f.20002,1491 brain haemorrhage,218.0,Y,,,,,,,
567,f.20002,1583 ischaemic stroke,44.0,N,N,,,,,,
568,f.20002,1082 transient ischaemic attack (tia),2243.0,N,N,,,,,,
569,f.20002,1083 subdural haemorrhage/haematoma,212.0,Y,,,,,,,


# Sample QC

## Inconsistent sex

In [18]:
reported_sex = ["f.31.0.0"]
genetic_sex = ["f.22001.0.0"]

In [19]:
# returns True if the sex information of an individual is missing or inconsistent
def inconsistent_sexes(row):
    """Flag a row whose reported and genetic sex cannot be trusted.

    Parameters
    ----------
    row : a row holding the columns named by ``reported_sex[0]`` and
        ``genetic_sex[0]``.

    Returns
    -------
    bool
        True when either value is missing or when the two values differ,
        False when both are present and equal.
    """
    reported = row[reported_sex[0]]
    genetic = row[genetic_sex[0]]
    # Comparing against a missing value yields <NA> rather than a boolean, which
    # would leak into the mask built from this function; treat any missing
    # value as an inconsistency so the result is always a plain bool.
    if pd.isna(genetic) or pd.isna(reported):
        return True
    return reported != genetic

In [20]:
# exclusion based on inconsistent sex
ex_sex = df[reported_sex + genetic_sex].apply(inconsistent_sexes, axis=1)

In [21]:
filtered = df[~ex_sex]

In [22]:
print(sum(ex_sex), "individuals removed because of inconsistency with the genetic and reported sex variables")

0 individuals removed because of inconsistency with the genetic and reported sex variables


In [23]:
# count the individuals whose genetic sex value is missing
n_missing_genetic_sex = df[genetic_sex[0]].isna().sum()
print("Of these individuals", n_missing_genetic_sex, "were NA for the genetic sex variable")

Of these individuals 0 were NA for the genetic sex variable


## Remove non-white individuals

In [24]:
# distinct answers (including missing) found in the first three ethnicity columns
set().union(*(filtered[ethnicity[k]].to_list() for k in range(3)))

{<NA>,
 'African',
 'Any other Asian background',
 'Any other Black background',
 'Any other mixed background',
 'Any other white background',
 'Asian or Asian British',
 'Bangladeshi',
 'Black or Black British',
 'British',
 'Caribbean',
 'Chinese',
 'Do not know',
 'Indian',
 'Irish',
 'Mixed',
 'Other ethnic group',
 'Pakistani',
 'Prefer not to answer',
 'White',
 'White and Asian',
 'White and Black African',
 'White and Black Caribbean'}

In [25]:
# Ethnicity answers grouped into broad categories. Taken together the groups
# should cover every possible answer except <NA>, "Do not know" and
# "Prefer not to answer".
white = [
    'British',
    'Irish',
    'White',
    'Any other white background',
]
african = [
    'Caribbean',
    'White and Black Caribbean',
    'African',
    'White and Black African',
    'Black or Black British',
    'Any other Black background',
]
asian = [
    'Indian',
    'Pakistani',
    'White and Asian',
    'Any other Asian background',
    'Bangladeshi',
    'Asian or Asian British',
]
mixed = [
    'Mixed',
    'Any other mixed background',
]
chinese = ['Chinese']
other = ['Other ethnic group']

# derive a single ancestry label from all ethnicity answers of one individual
def ancestry(row):
    """Collapse the ethnicity answers in ``row`` into one ancestry label.

    Missing answers, "Prefer not to answer" and "Do not know" are ignored.
    Returns "Unknown" when nothing informative is left; the answer itself
    (spaces replaced by underscores) when all answers are the same white
    category; "Inconsistent_white" when the answers are different white
    categories; the group name when every answer falls in one of the other
    groups; and "Inconsistent" when the answers span several groups.
    """
    uninformative = ("Prefer not to answer", "Do not know")
    answers = [a for a in row[ethnicity] if not pd.isna(a) and a not in uninformative]
    if not answers:
        return "Unknown"

    # one unique answer that is a white category: report that exact answer
    if len(set(answers)) == 1 and answers[0] in white:
        return answers[0].replace(" ", "_")

    # otherwise report the first group that contains every answer
    labelled_groups = (
        (white, "Inconsistent_white"),
        (african, "African"),
        (asian, "Asian"),
        (mixed, "Mixed"),
        (chinese, "Chinese"),
        (other, "Other"),
    )
    for members, label in labelled_groups:
        if all(a in members for a in answers):
            return label
    return "Inconsistent"

In [26]:
# Label every individual with an ancestry category. `assign` returns a new
# DataFrame instead of writing a column into `filtered` in place: `filtered` is a
# boolean-indexed selection of `df`, and in-place column assignment on such a
# selection can raise pandas' SettingWithCopyWarning.
filtered = filtered.assign(ethnicity=filtered[ethnicity].apply(ancestry, axis=1))

In [27]:
def find_non_white(row):
    """Return True when the row's ancestry label marks a non-white individual.

    Labels that are kept (False is returned): any entry of ``white``,
    "Unknown", "Inconsistent_white" and "Any_other_white_background".
    """
    label = row["ethnicity"]
    kept_labels = ("Unknown", "Inconsistent_white", "Any_other_white_background")
    return label not in white and label not in kept_labels

In [28]:
ex_non_white = filtered[["ethnicity"]].apply(find_non_white, axis=1)

In [29]:
filtered = filtered[~ex_non_white]

In [30]:
print(sum(ex_non_white), "individuals removed for being non-white")

25767 individuals removed for being non-white


In [31]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,...,f.41281.0.9,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity
0,1000019,1000019,Female,1960,November,,,,,1111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
1,1000022,1000022,Male,1954,August,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
2,1000035,1000035,Male,1944,May,,,,,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
3,1000046,1000046,Female,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
4,1000054,1000054,Female,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486412,6025409,6025409,Female,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486413,6025411,6025411,Female,1960,November,"No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486414,6025425,6025425,Female,1963,August,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British


## Only keep individuals that passed genotype array QC

In [32]:
# IDs from the first column of the QC .fam file, converted to str because the
# phenotype table was read with the "string" dtype.
qc_list = [str(i) for i in qc_individuals[0].to_list()]
# Set copy of qc_list: a membership test is then a hash lookup instead of a
# scan through the whole list for every row of the phenotype table.
qc_ids = set(qc_list)


def matches_qc_individuals(row):
    """Return True if the row's FID belongs to an individual that passed genotype array QC."""
    return row["FID"] in qc_ids


# Vectorised equivalent of applying matches_qc_individuals to every row.
filtered = filtered[filtered["FID"].isin(qc_ids)]

In [33]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,...,f.41281.0.9,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity
0,1000019,1000019,Female,1960,November,,,,,1111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
1,1000022,1000022,Male,1954,August,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
2,1000035,1000035,Male,1944,May,,,,,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
3,1000046,1000046,Female,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
4,1000054,1000054,Female,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486412,6025409,6025409,Female,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486413,6025411,6025411,Female,1960,November,"No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486414,6025425,6025425,Female,1963,August,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British


## Remove PCA outliers from the full database

Remove the outlier individuals from the full database, if any exist.

In [34]:
# since the IID from the dataframe is in string the outlier ids have to be made into string as well
out_ids = [str(x) for x in outlier[0].to_list()]
# set copy of out_ids so each membership test is a hash lookup rather than a list scan
out_id_set = set(out_ids)

def find_outliers(row):
    """Return True if the row's IID is listed in the PCA outlier file."""
    return row["IID"] in out_id_set

In [35]:
ex_pca_outliers = filtered[["IID", "FID"]].apply(find_outliers, axis=1)

In [36]:
filtered = filtered[~ex_pca_outliers]

In [37]:
print(sum(ex_pca_outliers), "individuals removed for being pca outliers")

1382 individuals removed for being pca outliers


In [38]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,...,f.41281.0.9,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity
0,1000019,1000019,Female,1960,November,,,,,1111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
1,1000022,1000022,Male,1954,August,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
2,1000035,1000035,Male,1944,May,,,,,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
3,1000046,1000046,Female,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
4,1000054,1000054,Female,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486412,6025409,6025409,Female,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486413,6025411,6025411,Female,1960,November,"No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486414,6025425,6025425,Female,1963,August,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British


In [6]:
filtered_only_whites = filtered
filtered = filtered_only_whites

## Filter out exclusions from the full database

If individuals have certain codes from ICD 10, ICD 9, and self-reports they must be fully removed from the analysis.

In [7]:
# tells whether the current individual carries at least one code from the exclusion list
def contains_exclusion(row, exclusion_list):
    """Return True if any non-missing value in ``row`` appears in ``exclusion_list``.

    Missing values are skipped; an empty row gives False.
    """
    return any(not pd.isna(code) and code in exclusion_list for code in row)

### Filter out ICD 10 exclusions

In [8]:
# these are the columns that represent the icd10 columns in the database
icd10_colnames = [col for col in filtered if "f.41270" in col]

In [9]:
# get a dataframe that only contains the icd10 columns from the full database
icd10 = filtered[icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,f.41270.0.10,f.41270.0.11,f.41270.0.12,f.41270.0.13,f.41270.0.14,f.41270.0.15,f.41270.0.16,f.41270.0.17,f.41270.0.18,f.41270.0.19,f.41270.0.20,f.41270.0.21,f.41270.0.22,f.41270.0.23,f.41270.0.24,f.41270.0.25,f.41270.0.26,f.41270.0.27,f.41270.0.28,f.41270.0.29,f.41270.0.30,f.41270.0.31,f.41270.0.32,f.41270.0.33,f.41270.0.34,f.41270.0.35,f.41270.0.36,f.41270.0.37,f.41270.0.38,f.41270.0.39,...,f.41270.0.186,f.41270.0.187,f.41270.0.188,f.41270.0.189,f.41270.0.190,f.41270.0.191,f.41270.0.192,f.41270.0.193,f.41270.0.194,f.41270.0.195,f.41270.0.196,f.41270.0.197,f.41270.0.198,f.41270.0.199,f.41270.0.200,f.41270.0.201,f.41270.0.202,f.41270.0.203,f.41270.0.204,f.41270.0.205,f.41270.0.206,f.41270.0.207,f.41270.0.208,f.41270.0.209,f.41270.0.210,f.41270.0.211,f.41270.0.212,f.41270.0.213,f.41270.0.214,f.41270.0.215,f.41270.0.216,f.41270.0.217,f.41270.0.218,f.41270.0.219,f.41270.0.220,f.41270.0.221,f.41270.0.222,f.41270.0.223,f.41270.0.224,f.41270.0.225
0,E041,H738,M750,M754,M758,N898,N920,N946,R104,Z038,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,F101,J342,R619,S8280,W010,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,H269,K579,K590,K621,M5459,N40,R040,R31,R398,Z466,Z538,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,E780,G473,R065,R074,Z824,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,C679,C787,C795,C798,D090,I802,I959,J181,K922,M169,M4782,M5499,M7989,N133,N179,N200,N201,N209,N390,N820,R42,R798,S7200,T831,W010,Y831,Y95,Z089,Z510,Z511,Z513,Z530,Z855,Z871,Z907,Z936,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459262,A099,D649,E279,E538,E559,I10,I839,K449,K573,K649,M060,M069,M179,M199,M2550,M819,R104,R11,R13,R410,R509,R590,R619,R634,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459263,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459264,O149,O266,O342,O471,O48,O610,O680,Z370,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459265,G551,M501,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [10]:
# get rows from exclusion database that contain the codes that need to be removed for icd10
exclude_icd10 = exclusion[(exclusion["UKBB_field_code"] == "f.41270") & (exclusion["Excluded_fulldb_lateonsetHI"] == 'Y') ]
exclude_icd10

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
27,f.41270,H65.2 Chronic serous otitis media,103.0,Y,,,,,,,
28,f.41270,H65.3 Chronic mucoid otitis media,960.0,Y,,,,,,,
29,f.41270,H65.4 Other chronic nonsuppurative otitis media,158.0,Y,,,,,,,
30,f.41270,"H65.9 Nonsuppurative otitis media, unspecified",508.0,Y,,,,,,,
33,f.41270,H66.1 Chronic tubotympanic suppurative otitis ...,40.0,Y,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
276,f.41270,"S07.9 Crushing injury of head, part unspecified",1.0,Y,,,,,,,
279,f.41270,S08.1 Traumatic amputation of ear,13.0,Y,,,,,,,
280,f.41270,S08.8 Traumatic amputation of other parts of head,1.0,Y,,,,,,,
281,f.41270,S08.9 Traumatic amputation of unspecified part...,1.0,Y,,,,,,,


In [11]:
# ICD10 codes to exclude: the first space-separated token of each Phenotype
# entry with the dot removed (e.g. "H65.2 Chronic serous otitis media" -> "H652")
ex_critia_icd10 = [
    phenotype.split(" ")[0].replace(".", "")
    for phenotype in exclude_icd10["Phenotype"].tolist()
]
ex_critia_icd10

['H652',
 'H653',
 'H654',
 'H659',
 'H661',
 'H662',
 'H663',
 'H664',
 'H669',
 'H680',
 'H701',
 'H702',
 'H708',
 'H709',
 'H71',
 'H731',
 'H738',
 'H739',
 'H740',
 'H741',
 'H742',
 'H743',
 'H748',
 'H749',
 'H750',
 'H758',
 'H800',
 'H801',
 'H802',
 'H808',
 'H809',
 'H810',
 'H830',
 'H831',
 'H832',
 'H900',
 'H901',
 'H902',
 'H910',
 'H933',
 'H940',
 'H948',
 'H950',
 'H951',
 'H958',
 'H959',
 'B020',
 'B021',
 'B022',
 'B023',
 'B027',
 'B028',
 'G000',
 'G001',
 'G002',
 'G003',
 'G008',
 'G009',
 'G01',
 'G020',
 'G021',
 'G028',
 'G030',
 'G031',
 'G032',
 'G038',
 'G039',
 'G040',
 'G041',
 'G042',
 'G048',
 'G049',
 'G050',
 'G051',
 'G052',
 'G058',
 'G060',
 'G061',
 'G062',
 'G07',
 'G08',
 'G09',
 'G510',
 'G511',
 'G512',
 'G513',
 'G514',
 'G518',
 'G519',
 'S0200',
 'S0201',
 'S0210',
 'S0211',
 'S0240',
 'S0241',
 'S0260',
 'S0261',
 'S0270',
 'S0271',
 'S0280',
 'S0281',
 'S0290',
 'S0291',
 'S045',
 'S046',
 'S049',
 'S0600',
 'S0601',
 'S0610',
 'S0611

In [12]:
# collect the individuals that should be excluded because of icd10
def ex_fxn_icd10(row):
    """Row-wise check: True if the row holds any excluded ICD10 code."""
    return contains_exclusion(row, ex_critia_icd10)


# Vectorised form of icd10.apply(ex_fxn_icd10, axis=1): isin() marks every cell
# that holds an excluded code (missing cells are not matched) and any(axis=1)
# reduces the marks to one boolean per individual.
ex_10 = icd10.isin(ex_critia_icd10).any(axis=1)

In [13]:
# remove them from the working database (which is now filtered. filtered_only_whites remains unchanged)
filtered = filtered[~ex_10]

In [14]:
print(sum(ex_10), "individuals removed because of icd10 codes")

12692 individuals removed because of icd10 codes


In [15]:
filtered

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
1,1,1000022,1000022,Male,1954,August,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
2,2,1000035,1000035,Male,1944,May,,,,,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
3,3,1000046,1000046,Female,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
4,4,1000054,1000054,Female,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
5,5,1000063,1000063,Male,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459262,486411,6025390,6025390,Female,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
459263,486412,6025409,6025409,Female,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
459264,486413,6025411,6025411,Female,1960,November,"No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
459265,486414,6025425,6025425,Female,1963,August,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1


### Filter out ICD 9 exclusions

In [16]:
# these are the columns that represent the icd9 columns in the working database
icd9_colnames = [col for col in filtered if "f.41271" in col]

In [17]:
# get a dataframe that only contains the icd9 columns from the working database
icd9 = filtered[icd9_colnames]
icd9

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,f.41271.0.10,f.41271.0.11,f.41271.0.12,f.41271.0.13,f.41271.0.14,f.41271.0.15,f.41271.0.16,f.41271.0.17,f.41271.0.18,f.41271.0.19,f.41271.0.20,f.41271.0.21,f.41271.0.22,f.41271.0.23,f.41271.0.24,f.41271.0.25,f.41271.0.26,f.41271.0.27,f.41271.0.28,f.41271.0.29,f.41271.0.30,f.41271.0.31,f.41271.0.32,f.41271.0.33,f.41271.0.34,f.41271.0.35,f.41271.0.36,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,3000,5198,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459262,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459263,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459264,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [18]:
# get rows from exclusion database that contain the codes that need to be removed for icd9
exclude_icd9 = exclusion[(exclusion["UKBB_field_code"] == "f.41271") & (exclusion["Excluded_fulldb_lateonsetHI"] == 'Y') ]
exclude_icd9

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
319,f.41271,3811 Chronic serous otitis media,8.0,Y,,,,,,,
320,f.41271,3812 Chronic mucoid otitis media,11.0,Y,,,,,,,
321,f.41271,3813 Other and unspecified chronic nonsuppurat...,3.0,Y,,,,,,,
322,f.41271,"3814 Nonsuppurative otitis media, not specifie...",19.0,Y,,,,,,,
323,f.41271,3815 Eustachian salpingitis,0.0,Y,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
516,f.41271,9050 Late effect of fracture of skull and face...,19.0,Y,,,,,,,
526,f.41271,"9259 Crushing injury of face, scalp and neck",2.0,Y,,,,,,,
532,f.41271,9514 Injury to facial nerve,0.0,Y,,,,,,,
533,f.41271,9515 Injury to acoustic nerve,1.0,Y,,,,,,,


In [19]:
# ICD9 codes to exclude from the working database: the first space-separated
# token of each Phenotype entry with any dot removed
# (e.g. "3811 Chronic serous otitis media" -> "3811")
ex_critia_icd9 = [
    phenotype.split(" ")[0].replace(".", "")
    for phenotype in exclude_icd9["Phenotype"].tolist()
]
ex_critia_icd9

['3811',
 '3812',
 '3813',
 '3814',
 '3815',
 '3816',
 '3819',
 '3821',
 '3822',
 '3823',
 '3824',
 '3829',
 '3831',
 '3832',
 '3833',
 '3838',
 '3839',
 '3841',
 '3850',
 '3851',
 '3852',
 '3853',
 '3858',
 '3859',
 '3860',
 '3863',
 '3864',
 '3865',
 '3868',
 '3869',
 '3870',
 '3871',
 '3872',
 '3878',
 '3879',
 '3885',
 '3890',
 '0530',
 '0531',
 '0532',
 '0537',
 '0538',
 '3200',
 '3201',
 '3202',
 '3203',
 '3204',
 '3205',
 '3207',
 '3208',
 '3209',
 '3210',
 '3211',
 '3212',
 '3213',
 '3214',
 '3215',
 '3216',
 '3217',
 '3218',
 '3220',
 '3221',
 '3222',
 '3229',
 '3230',
 '3231',
 '3232',
 '3233',
 '3234',
 '3235',
 '3236',
 '3237',
 '3238',
 '3239',
 '3240',
 '3241',
 '3249',
 '3259',
 '3269',
 '3510',
 '3511',
 '3518',
 '3519',
 '8000',
 '8001',
 '8002',
 '8003',
 '8010',
 '8011',
 '8012',
 '8013',
 '8022',
 '8023',
 '8024',
 '8025',
 '8028',
 '8029',
 '8030',
 '8031',
 '8032',
 '8033',
 '8040',
 '8041',
 '8042',
 '8043',
 '8509',
 '8510',
 '8511',
 '8520',
 '8521',
 '8530',
 

In [20]:
# Flag each individual whose ICD-9 columns hold at least one excluded code
def ex_fxn_icd9(row):
    return contains_exclusion(row, ex_critia_icd9)

ex_9 = icd9.apply(ex_fxn_icd9, axis=1)

In [21]:
# Keep only the rows that were not flagged by the ICD-9 exclusion mask
filtered = filtered.loc[~ex_9]

In [22]:
print(f"{sum(ex_9)} individuals removed because of icd9 codes")

739 individuals removed because of icd9 codes


In [23]:
filtered

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
1,1,1000022,1000022,Male,1954,August,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
2,2,1000035,1000035,Male,1944,May,,,,,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
3,3,1000046,1000046,Female,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
4,4,1000054,1000054,Female,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
5,5,1000063,1000063,Male,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459262,486411,6025390,6025390,Female,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
459263,486412,6025409,6025409,Female,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
459264,486413,6025411,6025411,Female,1960,November,"No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
459265,486414,6025425,6025425,Female,1963,August,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1


### Filter out f.20002 exclusion

In [24]:
# Names of the self-report (f.20002) columns present in the working database
f20002_colnames = [name for name in filtered.columns if "f.20002" in name]

In [25]:
# Sub-table of the working database restricted to the self-report columns
f20002 = filtered.loc[:, f20002_colnames]
f20002

Unnamed: 0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,f.20002.1.1,f.20002.1.2,f.20002.1.3,f.20002.1.4,f.20002.1.5,...,f.20002.2.28,f.20002.2.29,f.20002.2.30,f.20002.2.31,f.20002.2.32,f.20002.2.33,f.20002.3.0,f.20002.3.1,f.20002.3.2,f.20002.3.3,f.20002.3.4,f.20002.3.5,f.20002.3.6,f.20002.3.7,f.20002.3.8,f.20002.3.9,f.20002.3.10,f.20002.3.11,f.20002.3.12,f.20002.3.13,f.20002.3.14,f.20002.3.15,f.20002.3.16,f.20002.3.17,f.20002.3.18,f.20002.3.19,f.20002.3.20,f.20002.3.21,f.20002.3.22,f.20002.3.23,f.20002.3.24,f.20002.3.25,f.20002.3.26,f.20002.3.27,f.20002.3.28,f.20002.3.29,f.20002.3.30,f.20002.3.31,f.20002.3.32,f.20002.3.33
1,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459262,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459263,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1627,1478,1473,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459264,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459265,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [26]:
# Select the self-report (f.20002) rows of the exclusion table that are flagged 'Y'
# in the "Excluded_fulldb_lateonsetHI" column
exclude_f20002 = exclusion.loc[
    exclusion["UKBB_field_code"].eq("f.20002")
    & exclusion["Excluded_fulldb_lateonsetHI"].eq('Y')
]
exclude_f20002

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
539,f.20002,1420 otosclerosis,260.0,Y,,,,,,,
540,f.20002,1421 meniere's disease,1553.0,Y,,,,,,,
541,f.20002,1499 labyrinthitis,417.0,Y,,,,,,,
545,f.20002,1244 infection of nervous system,55.0,Y,,,,,,,
546,f.20002,1245 brain abscess/intracranial abscess,79.0,Y,,,,,,,
547,f.20002,1246 encephalitis,348.0,Y,,,,,,,
548,f.20002,1247 meningitis,2214.0,Y,,,,,,,
550,f.20002,1249 cranial nerve problem/palsy,289.0,Y,,,,,,,
551,f.20002,1250 bell's palsy/facial nerve palsy,591.0,Y,,,,,,,
553,f.20002,1240 neurological injury/trauma,130.0,Y,,,,,,,


In [27]:
# Reduce every "Phenotype" label to its leading token (the text before the first
# space) with dots removed; these are the self-report codes to exclude
ex_critia_f20002 = [
    label.split(" ")[0].replace(".", "")
    for label in exclude_f20002["Phenotype"].tolist()
]
ex_critia_f20002

['1420',
 '1421',
 '1499',
 '1244',
 '1245',
 '1246',
 '1247',
 '1249',
 '1250',
 '1240',
 '1626',
 '1086',
 '1491',
 '1083',
 '1425']

In [28]:
# Flag each individual whose self-report columns hold at least one excluded code
def ex_fxn_f20002(row):
    return contains_exclusion(row, ex_critia_f20002)

ex_f20002 = f20002.apply(ex_fxn_f20002, axis=1)

In [30]:
# Keep only the rows that were not flagged by the self-report exclusion mask
filtered = filtered.loc[~ex_f20002]



In [31]:
print(f"{sum(ex_f20002)} individuals removed because of self-reported codes")

5571 individuals removed because of self-reported codes


In [32]:
filtered

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
1,1,1000022,1000022,Male,1954,August,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
2,2,1000035,1000035,Male,1944,May,,,,,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
3,3,1000046,1000046,Female,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
4,4,1000054,1000054,Female,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
5,5,1000063,1000063,Male,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459262,486411,6025390,6025390,Female,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
459263,486412,6025409,6025409,Female,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
459264,486413,6025411,6025411,Female,1960,November,"No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
459265,486414,6025425,6025425,Female,1963,August,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1


## Identify Sex Column

In [42]:
def find_sex(row):
    """Encode the f.31.0.0 value of one row as an integer.

    Returns 0 when the value is exactly "Male" and 1 for anything else
    (so "Female", but also a missing or unexpected value, becomes 1).
    """
    return 0 if row["f.31.0.0"] == "Male" else 1

# Apply find_sex to every row of the f.31.0.0 column to get the numeric sex coding
sex = filtered[["f.31.0.0"]].apply(find_sex, axis=1)

In [43]:
# Store the numeric sex coding in a "sex" column of the working database
filtered["sex"] = sex

In [45]:
# Renumber the rows; called without drop=True, reset_index() keeps the previous
# index as a regular column
filtered = filtered.reset_index()

In [34]:
# NOTE: plain assignment only binds a second name to the same DataFrame object
# (no copy is made), so running these two lines leaves `filtered` unchanged.
saved_filtered = filtered
filtered = saved_filtered

In [33]:
# Write the working database to disk without the row index. The file is
# tab-separated (sep='\t') even though its name ends in .csv.
filtered.to_csv('UKB_440265whites_filtered.csv',sep='\t', index=False)

## Remove inconsistencies or unclear individuals

###  Prior to filtering for inconsistencies

<b>Tinnitus</b> <br>
f.4803 = {'No, never': ,
 'Yes, but not now, but have in the past': ,
 'Yes, now some of the time': ,
 'Yes, now a lot of the time': ,
 'Yes, now most or all of the time': ,
 'Do not know': ,
 'Prefer not to answer': }

In [35]:
def find_exclusion(row):
    """Return True when ``row`` has no non-missing value at all.

    Applied to the tinnitus columns of one individual, this flags people who
    gave no answer at any visit.
    """
    return row.dropna().empty


In [36]:
# True for individuals with no recorded answer in any of the tinnitus columns.
# Vectorised form of applying find_exclusion to every row: same boolean Series,
# without a Python-level call per individual.
NA_exclusion = filtered[tin_cols].isna().all(axis=1)

In [37]:
sum(NA_exclusion)

256614

In [38]:
440265 - 256614

183651

In [39]:
# Tinnitus columns of the individuals flagged by NA_exclusion, i.e. those with
# no answer at any visit (shown below as a sanity check)
filtered_tinnitus_answers = filtered.loc[NA_exclusion,tin_cols]

In [40]:
filtered_tinnitus_answers

Unnamed: 0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0
1,,,,
2,,,,
4,,,,
11,,,,
12,,,,
...,...,...,...,...
459257,,,,
459258,,,,
459260,,,,
459265,,,,


In [42]:
# Keep only the individuals with at least one recorded tinnitus answer
filtered_tinnitus = filtered.loc[~NA_exclusion]

In [43]:
filtered_tinnitus

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
3,3,1000046,1000046,Female,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
5,5,1000063,1000063,Male,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
6,6,1000078,1000078,Female,1955,June,,"No, never","No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
7,7,1000081,1000081,Male,1942,February,"No, never",,,,1075,1440,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
8,8,1000090,1000090,Female,1945,July,"No, never",,,,1082,1465,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Irish,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459259,486407,6025354,6025354,Female,1957,February,"No, never",,,,1351,1446,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
459261,486409,6025378,6025378,Male,1966,October,"No, never",,,,1065,1276,1222,1571,1569,1568,1630,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
459262,486411,6025390,6025390,Female,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
459263,486412,6025409,6025409,Female,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1


In [44]:
# Tally every entry of the tinnitus columns across all visits, missing values included
filtered_tinnitus[tin_cols].stack(dropna=False).value_counts(dropna=False)

<NA>                                      522028
No, never                                 147299
Yes, but not now, but have in the past     22202
Yes, now some of the time                  19161
Yes, now most or all of the time           14579
Yes, now a lot of the time                  5722
Do not know                                 3408
Prefer not to answer                         205
dtype: Int64

### Inconsistencies in the tinnitus answers

In [47]:
# Numeric coding of the f.4803 answers: 0 = "No, never", 1 = any "Yes" answer
# (current or past), 9 = "Do not know"
tin_ans = {
    "Do not know": 9,
    'Yes, but not now, but have in the past': 1,
    'Yes, now some of the time': 1,
    'Yes, now a lot of the time': 1,
    'Yes, now most or all of the time': 1,
    'No, never': 0,
}
# Filled in by find_options() as a side effect
options = set()


def find_options(row):
    """Add the answer pattern of one individual to the global ``options`` set.

    The pattern is the concatenation, in row order, of the ``tin_ans`` digits
    of every answer in ``row``. Missing values and "Prefer not to answer" are
    skipped. Nothing is returned; the function is called for its side effect.
    """
    digits = []
    for reply in row:
        if pd.isna(reply) or reply == "Prefer not to answer":
            continue
        digits.append(str(tin_ans[reply]))
    options.add("".join(digits))

In [48]:
# Run find_options on every individual so that `options` ends up holding each
# distinct answer pattern that occurs in the database

tin_qs = filtered_tinnitus.loc[:, tin_cols]
s = tin_qs.apply(find_options, axis=1)

In [49]:
options

{'',
 '0',
 '00',
 '000',
 '0000',
 '0001',
 '001',
 '0010',
 '0011',
 '009',
 '01',
 '010',
 '0100',
 '0101',
 '0109',
 '011',
 '0110',
 '0111',
 '019',
 '09',
 '090',
 '091',
 '099',
 '1',
 '10',
 '100',
 '1000',
 '101',
 '1010',
 '1011',
 '11',
 '110',
 '1100',
 '1101',
 '111',
 '1110',
 '1111',
 '119',
 '19',
 '190',
 '191',
 '1911',
 '1919',
 '199',
 '9',
 '90',
 '900',
 '901',
 '9011',
 '909',
 '91',
 '911',
 '99',
 '990',
 '991',
 '999'}

In [50]:
# An answer pattern may be inconsistent when it mixes different kinds of answers:
# "Do not know" (9) together with "No" (0) and/or "Yes" (1), or "Yes" together with "No".
# For every pattern in `options`, note which of the digits 0, 1 and 9 occur in it.
digits_seen = {code: ('0' in code, '1' in code, '9' in code) for code in options}

do_not_know_no = [code for code, seen in digits_seen.items() if seen == (True, False, True)]
do_not_know_yes = [code for code, seen in digits_seen.items() if seen == (False, True, True)]
yes_no = [code for code, seen in digits_seen.items() if seen == (True, True, False)]
with_all_three = [code for code, seen in digits_seen.items() if seen == (True, True, True)]

# Every mixed pattern is a candidate for being flagged as inconsistent
might_inconsistent = [*do_not_know_no, *do_not_know_yes, *yes_no, *with_all_three]

# Mixed patterns that are nevertheless accepted (for example "No" or "Do not know"
# at earlier visits followed by "Yes", or "Do not know" followed by "No")
exceptions = ["91","911","9111","991","0001","001","0011","01","011","0111", "0091", "091","0911","0991","9001","901","9011", "90", "900", "9000", "990"]

# The patterns that are actually treated as inconsistent: candidates minus the exceptions
inconsistent = [code for code in might_inconsistent if code not in exceptions]

In [51]:
inconsistent

['909',
 '090',
 '09',
 '009',
 '099',
 '191',
 '199',
 '1919',
 '1911',
 '19',
 '119',
 '1110',
 '010',
 '0101',
 '1100',
 '110',
 '0100',
 '0110',
 '100',
 '1010',
 '1000',
 '101',
 '1101',
 '0010',
 '10',
 '1011',
 '019',
 '0109',
 '190']

In [52]:
def find_inconsistencies(row):
    """Return True when the individual in ``row`` should be removed.

    A row is removed when any answer equals "I am completely deaf", or when
    its answer pattern (the ``tin_ans`` digits of the remaining answers, with
    missing values and "Prefer not to answer" skipped) is listed in the
    global ``inconsistent``. Pass the answer columns of one phenotype only.
    """
    if any(not pd.isna(reply) and reply == "I am completely deaf" for reply in row):
        return True

    skipped = ("I am completely deaf", "Prefer not to answer")
    code = "".join(
        str(tin_ans[reply])
        for reply in row
        if not pd.isna(reply) and reply not in skipped
    )
    return code in inconsistent

In [53]:
# Flag the individuals with an inconsistent answer pattern, then drop them
exclude = filtered_tinnitus.loc[:, tin_cols].apply(find_inconsistencies, axis=1)
filtered_tinnitus = filtered_tinnitus.loc[~exclude]

In [54]:
print(f"{sum(exclude)} individuals removed because of inconsistencies")

1959 individuals removed because of inconsistencies


In [55]:
filtered_tinnitus

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
3,3,1000046,1000046,Female,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
5,5,1000063,1000063,Male,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
6,6,1000078,1000078,Female,1955,June,,"No, never","No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
7,7,1000081,1000081,Male,1942,February,"No, never",,,,1075,1440,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
8,8,1000090,1000090,Female,1945,July,"No, never",,,,1082,1465,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Irish,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459259,486407,6025354,6025354,Female,1957,February,"No, never",,,,1351,1446,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
459261,486409,6025378,6025378,Male,1966,October,"No, never",,,,1065,1276,1222,1571,1569,1568,1630,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0
459262,486411,6025390,6025390,Female,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1
459263,486412,6025409,6025409,Female,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1


In [56]:
# Tally the recorded (non-missing) answers across all visits
answers_long = filtered_tinnitus[tin_cols].stack(dropna=True)
answers_long.value_counts(dropna=True)

No, never                                 145136
Yes, but not now, but have in the past     20953
Yes, now some of the time                  18527
Yes, now most or all of the time           14527
Yes, now a lot of the time                  5668
Do not know                                 3121
Prefer not to answer                         204
dtype: Int64

# Identify Pure Control

Need to make sure that for tinnitus we obtain the individuals that say no.

The conditions for saying no are as follows:
* Never saying yes
* Not being inconsistent

We also exclude individuals from the control group if they carry a tinnitus code in ICD9 (code 3883), ICD10 (code H931), or f.20002 (code 1597), even when they answered "No, never" at every tinnitus assessment. However, these individuals can still be included among the cases.

In [57]:
# Numeric coding of the f.4803 answers: 0 = "No, never", 1 = any "Yes" answer
# (current or past), 9 = "Do not know"
tin_ans = {
    "Do not know": 9,
    'Yes, but not now, but have in the past': 1,
    'Yes, now some of the time': 1,
    'Yes, now a lot of the time': 1,
    'Yes, now most or all of the time': 1,
    'No, never': 0,
}


def find_ctrl(row):
    """Return False when ``row`` describes a control, True otherwise.

    A control has at least one "No, never" answer and no "Yes" answer of any
    kind. Missing values and "Prefer not to answer" are ignored, so a row with
    only "Do not know" answers, or with no usable answer, returns True.
    """
    code = "".join(
        str(tin_ans[reply])
        for reply in row
        if not pd.isna(reply) and reply != "Prefer not to answer"
    )
    is_control = "0" in code and "1" not in code
    return not is_control

In [58]:
# NOTE: despite the variable name, find_ctrl returns False for controls, so a
# True entry here marks an individual who is NOT a control
f4803_ctrl = filtered_tinnitus[tin_cols].apply(find_ctrl, axis=1)

In [59]:
f4803_ctrl

3         False
5         False
6         False
7         False
8         False
          ...  
459259    False
459261    False
459262    False
459263    False
459264    False
Length: 181692, dtype: bool

In [60]:
sum(f4803_ctrl)

56581

In [61]:
181692-56581

125111

## Collect ICD 10 codes to filter out from Ctrl

In [62]:
def contains_exclusion(row, exclusion_list):
    """Return True when any non-missing value of ``row`` is in ``exclusion_list``.

    ``row`` holds the codes of one individual; missing entries are ignored.
    Returns False when nothing matches, including for an empty row.
    """
    return any(not pd.isna(code) and code in exclusion_list for code in row)

In [63]:
# Load the CSV that lists, per UKBB field code (ICD-10, ICD-9, self-report), which
# phenotype codes are excluded from the database or from the controls
exclusion = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/ICD10_9_selfreport_incl_excl.csv")
exclusion

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
0,f.41270,H60-H62 Diseases of external ear,,,,,,,,,
1,f.41270,H60 Otitis externa,,,,,,,,,
2,f.41270,H60.0 Abscess of external ear,32.0,N,N,,,,,,
3,f.41270,H60.1 Cellulitis of external ear,218.0,N,N,,,,,,
4,f.41270,H60.2 Malignant otitis externa,49.0,N,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
566,f.20002,1491 brain haemorrhage,218.0,Y,,,,,,,
567,f.20002,1583 ischaemic stroke,44.0,N,N,,,,,,
568,f.20002,1082 transient ischaemic attack (tia),2243.0,N,N,,,,,,
569,f.20002,1083 subdural haemorrhage/haematoma,212.0,Y,,,,,,,


In [64]:
# ICD-10 (f.41270) rows of the exclusion table flagged 'Y' in "Excluded_from_controls"
exclude_ctrl_icd10 = exclusion.loc[
    exclusion["UKBB_field_code"].eq("f.41270")
    & exclusion["Excluded_from_controls"].eq('Y')
]
exclude_ctrl_icd10

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
97,f.41270,H83.3 Noise effects on inner ear,24.0,N,Y,,,,,,
98,f.41270,H83.8 Other specified diseases of inner ear,51.0,N,Y,,,,,,
99,f.41270,"H83.9 Disease of inner ear, unspecified",33.0,N,Y,,,,,,
105,f.41270,"H90.3 Sensorineural hearing loss, bilateral",721.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,Individuals with this code were initially excl...
106,f.41270,"H90.4 Sensorineural hearing loss, unilateral w...",185.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,N,,
107,f.41270,"H90.5 Sensorineural hearing loss, unspecified",880.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,Individuals with this code were initially excl...
108,f.41270,H90.6 Mixed conductive and sensorineural heari...,133.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,Individuals with this code were initially excl...
109,f.41270,H90.7 Mixed conductive and sensorineural heari...,75.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,N,,
110,f.41270,H90.8 Mixed conductive and sensorineural heari...,115.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,Individuals with this code were initially excl...
113,f.41270,H91.1 Presbycusis,408.0,N,Y,,,,N,,


In [65]:
# Leading token of every "Phenotype" label with dots removed (e.g. "H83.3 ..." -> "H833")
ex_critia_ctrl_icd10 = [
    label.split(" ")[0].replace(".", "")
    for label in exclude_ctrl_icd10["Phenotype"].tolist()
]
ex_critia_ctrl_icd10

['H833',
 'H838',
 'H839',
 'H903',
 'H904',
 'H905',
 'H906',
 'H907',
 'H908',
 'H911',
 'H912',
 'H913',
 'H918',
 'H919',
 'H930',
 'H931',
 'H932',
 'H933',
 'H938',
 'H939',
 'Z461',
 'Z974']

In [66]:
# ICD-10 columns of the individuals that remain after the tinnitus filtering
icd10 = filtered_tinnitus.loc[:, icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,f.41270.0.10,f.41270.0.11,f.41270.0.12,f.41270.0.13,f.41270.0.14,f.41270.0.15,f.41270.0.16,f.41270.0.17,f.41270.0.18,f.41270.0.19,f.41270.0.20,f.41270.0.21,f.41270.0.22,f.41270.0.23,f.41270.0.24,f.41270.0.25,f.41270.0.26,f.41270.0.27,f.41270.0.28,f.41270.0.29,f.41270.0.30,f.41270.0.31,f.41270.0.32,f.41270.0.33,f.41270.0.34,f.41270.0.35,f.41270.0.36,f.41270.0.37,f.41270.0.38,f.41270.0.39,...,f.41270.0.186,f.41270.0.187,f.41270.0.188,f.41270.0.189,f.41270.0.190,f.41270.0.191,f.41270.0.192,f.41270.0.193,f.41270.0.194,f.41270.0.195,f.41270.0.196,f.41270.0.197,f.41270.0.198,f.41270.0.199,f.41270.0.200,f.41270.0.201,f.41270.0.202,f.41270.0.203,f.41270.0.204,f.41270.0.205,f.41270.0.206,f.41270.0.207,f.41270.0.208,f.41270.0.209,f.41270.0.210,f.41270.0.211,f.41270.0.212,f.41270.0.213,f.41270.0.214,f.41270.0.215,f.41270.0.216,f.41270.0.217,f.41270.0.218,f.41270.0.219,f.41270.0.220,f.41270.0.221,f.41270.0.222,f.41270.0.223,f.41270.0.224,f.41270.0.225
3,E780,G473,R065,R074,Z824,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,C19,C20,D037,D125,K635,L720,Z860,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,E780,H251,H269,I10,I210,I219,I251,I252,I258,I259,I842,I849,K409,K573,K620,K632,K638,K649,M171,M175,M8796,Z539,Z824,Z861,Z864,Z922,Z955,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,A099,E780,E785,G819,I10,I309,K219,K298,K317,K742,M109,M1317,M1917,M199,M1997,N840,N856,N950,R55,R943,S822,W180,Y86,Z133,Z470,Z721,Z864,Z866,Z867,Z981,Z993,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459259,A084,C435,C780,C787,C788,C795,C797,D259,D261,D485,M549,N832,R945,Z858,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459261,A099,D508,E101,E103,E109,E119,E162,F329,F799,F819,G119,H342,H360,I10,I209,I251,I259,I509,J069,J189,J22,K590,L031,L97,R101,R11,R296,R739,S519,S818,T793,V041,Z911,Z922,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459262,A099,D649,E279,E538,E559,I10,I839,K449,K573,K649,M060,M069,M179,M199,M2550,M819,R104,R11,R13,R410,R509,R590,R619,R634,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459263,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [67]:
# Flag each individual whose ICD-10 columns hold a code excluded from the controls
def ex_fxn_icd10(row):
    return contains_exclusion(row, ex_critia_ctrl_icd10)

ex_10 = icd10.apply(ex_fxn_icd10, axis=1)

In [69]:
sum(ex_10)

3857

## Collect ICD 9 codes to filter out from Ctrl

In [70]:
# ICD-9 (f.41271) rows of the exclusion table flagged 'Y' in "Excluded_from_controls"
exclude_ctrl_icd9 = exclusion.loc[
    exclusion["UKBB_field_code"].eq("f.41271")
    & exclusion["Excluded_from_controls"].eq('Y')
]
exclude_ctrl_icd9

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
371,f.41271,3880 Degenerative and vascular disorders of ear,0.0,N,Y,,,,,,
372,f.41271,3881 Noise effects on inner ear,0.0,N,Y,,,,,,
373,f.41271,"3882 Sudden hearing loss, unspecified",0.0,N,Y,,,,,,
374,f.41271,3883 Tinnitus,11.0,N,Y,,,,,,
375,f.41271,3884 Other abnormal auditory perception,0.0,N,Y,,,,,,
379,f.41271,3888 Other specified disorders of ear,1.0,N,Y,,,,,,
380,f.41271,"3889 Disorders of ear, unspecified",2.0,N,Y,,,,,,
383,f.41271,3891 Sensorineural deafness,6.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,
384,f.41271,3892 Mixed conductive and sensorineural deafness,1.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,
385,f.41271,"3897 Deaf mutism, not elsewhere classifiable",1.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,


In [71]:
# Leading token of every "Phenotype" label with dots removed: the ICD-9 codes
# that disqualify an individual from being a control
ex_critia_ctrl_icd9 = [
    label.split(" ")[0].replace(".", "")
    for label in exclude_ctrl_icd9["Phenotype"].tolist()
]
ex_critia_ctrl_icd9

['3880',
 '3881',
 '3882',
 '3883',
 '3884',
 '3888',
 '3889',
 '3891',
 '3892',
 '3897',
 '3898',
 '3899',
 'V412',
 'V532']

In [72]:
# ICD-9 columns of the individuals that remain after the tinnitus filtering
icd9 = filtered_tinnitus.loc[:, icd9_colnames]
icd9

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,f.41271.0.10,f.41271.0.11,f.41271.0.12,f.41271.0.13,f.41271.0.14,f.41271.0.15,f.41271.0.16,f.41271.0.17,f.41271.0.18,f.41271.0.19,f.41271.0.20,f.41271.0.21,f.41271.0.22,f.41271.0.23,f.41271.0.24,f.41271.0.25,f.41271.0.26,f.41271.0.27,f.41271.0.28,f.41271.0.29,f.41271.0.30,f.41271.0.31,f.41271.0.32,f.41271.0.33,f.41271.0.34,f.41271.0.35,f.41271.0.36,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459259,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459261,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459262,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459263,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [73]:
ex_fxn_icd9 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd9)
ex_9 = icd9.apply(ex_fxn_icd9, axis=1)

In [74]:
sum(ex_9)

2

## Collect f20002 codes to filter out from Ctrl

In [75]:
# get a dataframe that only contains the self-report columns from the working database
f20002 = filtered_tinnitus[f20002_colnames]
f20002

Unnamed: 0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,f.20002.1.1,f.20002.1.2,f.20002.1.3,f.20002.1.4,f.20002.1.5,...,f.20002.2.28,f.20002.2.29,f.20002.2.30,f.20002.2.31,f.20002.2.32,f.20002.2.33,f.20002.3.0,f.20002.3.1,f.20002.3.2,f.20002.3.3,f.20002.3.4,f.20002.3.5,f.20002.3.6,f.20002.3.7,f.20002.3.8,f.20002.3.9,f.20002.3.10,f.20002.3.11,f.20002.3.12,f.20002.3.13,f.20002.3.14,f.20002.3.15,f.20002.3.16,f.20002.3.17,f.20002.3.18,f.20002.3.19,f.20002.3.20,f.20002.3.21,f.20002.3.22,f.20002.3.23,f.20002.3.24,f.20002.3.25,f.20002.3.26,f.20002.3.27,f.20002.3.28,f.20002.3.29,f.20002.3.30,f.20002.3.31,f.20002.3.32,f.20002.3.33
3,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,1075,1440,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,1082,1465,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459259,1351,1446,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459261,1065,1276,1222,1571,1569,1568,1630,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459262,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
459263,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1627,1478,1473,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [76]:
exclude_ctrl_f20002 = exclusion[(exclusion["UKBB_field_code"] == "f.20002") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_f20002

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
543,f.20002,1597 tinnitus / tiniitis,1950.0,N,Y,,,,,,


In [77]:
# Reduce every "<code> <description>" phenotype label to its bare code:
# keep the text in front of the first space and strip any dots from it.
ex_critia_ctrl_f20002 = [
    label.split(" ")[0].replace(".", "")
    for label in exclude_ctrl_f20002["Phenotype"].tolist()
]
ex_critia_ctrl_f20002

['1597']

In [78]:
ex_fxn_f20002 = lambda row: contains_exclusion(row, ex_critia_ctrl_f20002)
ex_f20002 = f20002.apply(ex_fxn_f20002, axis=1)

In [79]:
sum(ex_f20002)

1553

## Collect individuals with other tinnitus codes to filter out from CTRL

In [80]:
def check_code(row, code):
    """Return 1 if any non-missing entry of ``row`` equals ``code``, else 0.

    row:  iterable of values (used here on one individual's code columns)
    code: the value to look for
    """
    # Missing entries are filtered out first, so NA never reaches the comparison.
    hits = (entry == code for entry in row if not pd.isna(entry))
    return 1 if any(hits) else 0

In [81]:
tinn_icd10_check_code = lambda row: check_code(row, "H931")
tinn_icd10 = filtered_tinnitus[icd10_colnames].apply(tinn_icd10_check_code, axis = 1)

In [82]:
sum(tinn_icd10)

442

In [83]:
filtered_tinnitus["tinn_icd10"] = tinn_icd10

In [84]:
tinn_icd9_check_code = lambda row: check_code(row, "3883")
tinn_icd9 = filtered_tinnitus[icd9_colnames].apply(tinn_icd9_check_code, axis = 1)
filtered_tinnitus["tinn_icd9"] = tinn_icd9

In [85]:
sum(tinn_icd9)

0

In [86]:
# Self-reported illness columns (field 20002). The list is built from
# filtered_tinnitus itself so it always matches the frame it is used to index.
self_report_cols = [col for col in filtered_tinnitus.columns if "f.20002" in col]
# "1597" is the self-report code listed as tinnitus in the exclusion table
tinn_self_report_check_code = lambda row: check_code(row, "1597")
tinn_self_report = filtered_tinnitus[self_report_cols].apply(tinn_self_report_check_code, axis = 1)
filtered_tinnitus["tinn_self_report"] = tinn_self_report

In [87]:
sum(tinn_self_report)

1553

## Filter out Tinnitus Ctrl

In [88]:
sum(f4803_ctrl | tinn_icd10 | tinn_icd9 | tinn_self_report)

56700

In [89]:
temp = f4803_ctrl | tinn_icd10 | tinn_icd9 | tinn_self_report

In [90]:
filtered_ctrl = [1 if i else 0 for i in temp.to_list()]

In [91]:
filtered_tinnitus["tinnitus_pure_ctrl"] = filtered_ctrl

In [92]:
filtered_tinnitus["tinnitus_pure_ctrl"].value_counts()

0    124992
1     56700
Name: tinnitus_pure_ctrl, dtype: int64

In [None]:
sum(f4803_ctrl | ex_10 | ex_9 | ex_f20002 | tinn_icd10 | tinn_icd9 | tinn_self_report)

58292

In [96]:
# Stricter control flag: the same union as tinnitus_pure_ctrl, extended with the
# control-exclusion flags ex_10, ex_9 and ex_f20002. A row ends up 1 if any flag is set.
temp = f4803_ctrl | ex_10 | ex_9 | ex_f20002 | tinn_icd10 | tinn_icd9 | tinn_self_report
filtered_ctrl = [int(bool(flag)) for flag in temp.to_list()]
filtered_tinnitus["tinnitus_pure_ctrl2"] = filtered_ctrl
filtered_tinnitus["tinnitus_pure_ctrl2"].value_counts()

0    123400
1     58292
Name: tinnitus_pure_ctrl2, dtype: int64

## Identify pure controls age

In [98]:
# Age columns (field 21003). The list is built from filtered_tinnitus itself
# so it always matches the frame it is used to index.
ages_f21003_col = [col for col in filtered_tinnitus.columns if "f.21003" in col]
ages_f21003_col

['f.21003.0.0', 'f.21003.1.0', 'f.21003.2.0', 'f.21003.3.0']

In [101]:
def get_ctrl_age(row, flag_col="tinnitus_pure_ctrl", answer_cols=None,
                 age_cols=None, answer_codes=None):
    """Return the age at the most recent visit with a "no" (code 0) answer.

    Parameters
    ----------
    row : pandas.Series
        One individual, holding ``flag_col`` plus the answer and age columns.
    flag_col : str
        Column whose value 0 selects the rows to process; any other value
        yields ``pd.NA``.
    answer_cols, age_cols : list of str, optional
        Answer columns and the age columns of the same visits, listed in the
        same visit order. Default to the notebook globals ``tin_cols`` and
        ``ages_f21003_col``.
    answer_codes : dict, optional
        Maps an answer string to its code. Defaults to the notebook global
        ``tin_ans`` as it is bound at call time.

    Returns
    -------
    The age paired with the latest code-0 answer, or ``pd.NA`` when the row
    is not flagged 0 or contains no such answer.
    """
    if row[flag_col] != 0:
        return pd.NA

    # Globals are resolved lazily so that explicit arguments fully replace them.
    answer_cols = tin_cols if answer_cols is None else answer_cols
    age_cols = ages_f21003_col if age_cols is None else age_cols
    answer_codes = tin_ans if answer_codes is None else answer_codes

    answers = row[answer_cols].to_list()
    ages = row[age_cols].to_list()
    # Walk the visits from last to first: the first "no" met is the latest one.
    for answer, age in zip(reversed(answers), reversed(ages)):
        if pd.isna(answer):
            continue
        # .get() lets answers missing from the mapping (e.g. "Prefer not to
        # answer") fall through instead of raising KeyError.
        if answer_codes.get(answer) == 0:
            return age
    # No "no" answer at any visit: report missing rather than fail on an empty list.
    return pd.NA



In [102]:
filtered_tinnitus["tinnitus_ctrl_age"] = filtered_tinnitus[ages_f21003_col +["tinnitus_pure_ctrl"] + tin_cols].apply(get_ctrl_age, axis=1)


In [103]:
def get_ctrl_age(row, flag_col="tinnitus_pure_ctrl2", answer_cols=None,
                 age_cols=None, answer_codes=None):
    """Return the age at the most recent visit with a "no" (code 0) answer.

    This redefinition differs from the earlier one only in its default
    ``flag_col`` ("tinnitus_pure_ctrl2").

    Parameters
    ----------
    row : pandas.Series
        One individual, holding ``flag_col`` plus the answer and age columns.
    flag_col : str
        Column whose value 0 selects the rows to process; any other value
        yields ``pd.NA``.
    answer_cols, age_cols : list of str, optional
        Answer columns and the age columns of the same visits, listed in the
        same visit order. Default to the notebook globals ``tin_cols`` and
        ``ages_f21003_col``.
    answer_codes : dict, optional
        Maps an answer string to its code. Defaults to the notebook global
        ``tin_ans`` as it is bound at call time.

    Returns
    -------
    The age paired with the latest code-0 answer, or ``pd.NA`` when the row
    is not flagged 0 or contains no such answer.
    """
    if row[flag_col] != 0:
        return pd.NA

    # Globals are resolved lazily so that explicit arguments fully replace them.
    answer_cols = tin_cols if answer_cols is None else answer_cols
    age_cols = ages_f21003_col if age_cols is None else age_cols
    answer_codes = tin_ans if answer_codes is None else answer_codes

    answers = row[answer_cols].to_list()
    ages = row[age_cols].to_list()
    # Walk the visits from last to first: the first "no" met is the latest one.
    for answer, age in zip(reversed(answers), reversed(ages)):
        if pd.isna(answer):
            continue
        # .get() lets answers missing from the mapping (e.g. "Prefer not to
        # answer") fall through instead of raising KeyError.
        if answer_codes.get(answer) == 0:
            return age
    # No "no" answer at any visit: report missing rather than fail on an empty list.
    return pd.NA

In [104]:
filtered_tinnitus["tinnitus_ctrl2_age"] = filtered_tinnitus[ages_f21003_col +["tinnitus_pure_ctrl2"] + tin_cols].apply(get_ctrl_age, axis=1)

In [105]:
filtered_tinnitus

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex,tinn_icd10,tinn_icd9,tinn_self_report,tinnitus_pure_ctrl,tinnitus_pure_ctrl2,tinnitus_ctrl_age,tinnitus_ctrl2_age
3,3,1000046,1000046,Female,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,73,73
5,5,1000063,1000063,Male,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0,0,0,0,0,0,43,43
6,6,1000078,1000078,Female,1955,June,,"No, never","No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,60,60
7,7,1000081,1000081,Male,1942,February,"No, never",,,,1075,1440,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0,0,0,0,0,0,67,67
8,8,1000090,1000090,Female,1945,July,"No, never",,,,1082,1465,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Irish,1,0,0,0,0,0,64,64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459259,486407,6025354,6025354,Female,1957,February,"No, never",,,,1351,1446,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,52,52
459261,486409,6025378,6025378,Male,1966,October,"No, never",,,,1065,1276,1222,1571,1569,1568,1630,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0,0,0,0,0,0,42,42
459262,486411,6025390,6025390,Female,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,67,67
459263,486412,6025409,6025409,Female,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,66,66


# Identify Cases

**Analysis plan:**

1. Individuals who have ever had tinnitus (all four yes categories) vs. "No, never" (no tinnitus vs. any type of tinnitus). 

* Yes now most of the time or all of the time
* Yes now a lot of the time
* Yes now some of the time
* Yes but not now but have in the past
* plus ICD-10 / ICD-9 / self-reported tinnitus codes

2. Individuals in the three top YES categories vs NO never 

* Yes now most of the time or all of the time
* Yes now a lot of the time
* Yes now some of the time


3. Individuals that say yes in the top two categories vs No never 

* Yes now most of the time or all of the time
* Yes now a lot of the time



In [106]:
def find_yes(row, answer_codes=None):
    """Return 1 if any answer in ``row`` is coded 1 ("yes"), else 0.

    Parameters
    ----------
    row : iterable
        One individual's answers across visits; missing values and
        "Prefer not to answer" are ignored.
    answer_codes : dict, optional
        Maps an answer string to its code. Defaults to the notebook global
        ``tin_ans`` as bound at call time. The notebook rebinds ``tin_ans``
        before each case group, so pass the mapping explicitly to pin one
        definition.
    """
    if answer_codes is None:
        answer_codes = tin_ans
    # Visit order does not affect the result, so the row is scanned as given.
    for answer in row:
        if pd.isna(answer) or answer == "Prefer not to answer":
            continue
        # Single lookup; answers absent from the mapping simply do not count.
        if answer_codes.get(answer) == 1:
            return 1
    return 0

## Case group 1

In [107]:
tin_ans = {"Do not know":9, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [108]:
tinn_yes = filtered_tinnitus[tin_cols].apply(find_yes, axis=1)

In [109]:
sum(tinn_yes)

53613

In [110]:
temp = tinn_yes | tinn_icd10 | tinn_icd9 | tinn_self_report
tinn_yes_all = [1 if i else 0 for i in temp.to_list()]

In [111]:
sum(tinn_yes_all)

53735

In [112]:
filtered_tinnitus["tinnitus_1"] = tinn_yes_all

In [92]:
# NOTE(review): this cell looks stale. Its execution count (92) is out of sequence, it
# indexes `filtered` although "tinnitus_1" was just added to `filtered_tinnitus`, and
# get_phen_age is only defined further down in the notebook. Confirm before re-running.
filtered["tinnitus1_age"] = filtered[ages_f21003_col +["tinnitus_1"] + tin_cols].apply(get_phen_age, axis=1)

## Case group 2

In [113]:
tin_ans = {"Do not know":9, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [114]:
tinn_yes = filtered_tinnitus[tin_cols].apply(find_yes, axis=1)

In [115]:
sum(tinn_yes)

34817

In [116]:
filtered_tinnitus["tinnitus_2"] = tinn_yes

## Case group 3

In [117]:
tin_ans = {"Do not know":9, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [118]:
tinn_yes = filtered_tinnitus[tin_cols].apply(find_yes, axis=1)

In [119]:
sum(tinn_yes)

18112

In [120]:
filtered_tinnitus["tinnitus_3"] = tinn_yes

In [121]:
filtered_tinnitus

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex,tinn_icd10,tinn_icd9,tinn_self_report,tinnitus_pure_ctrl,tinnitus_pure_ctrl2,tinnitus_ctrl_age,tinnitus_ctrl2_age,tinnitus_1,tinnitus_2,tinnitus_3
3,3,1000046,1000046,Female,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,73,73,0,0,0
5,5,1000063,1000063,Male,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0,0,0,0,0,0,43,43,0,0,0
6,6,1000078,1000078,Female,1955,June,,"No, never","No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,60,60,0,0,0
7,7,1000081,1000081,Male,1942,February,"No, never",,,,1075,1440,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0,0,0,0,0,0,67,67,0,0,0
8,8,1000090,1000090,Female,1945,July,"No, never",,,,1082,1465,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Irish,1,0,0,0,0,0,64,64,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459259,486407,6025354,6025354,Female,1957,February,"No, never",,,,1351,1446,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,52,52,0,0,0
459261,486409,6025378,6025378,Male,1966,October,"No, never",,,,1065,1276,1222,1571,1569,1568,1630,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0,0,0,0,0,0,42,42,0,0,0
459262,486411,6025390,6025390,Female,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,67,67,0,0,0
459263,486412,6025409,6025409,Female,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,66,66,0,0,0


In [122]:
filtered_tinnitus.to_csv('tinnitus_124992purecontrols_53735tinnutus1_34817tinnitus2_18112tinnitus3.csv', sep='\t', index=False)

In [124]:
filtered_tinnitus[["tinnitus_pure_ctrl","tinnitus_1","tinnitus_2","tinnitus_3"]].value_counts()

tinnitus_pure_ctrl  tinnitus_1  tinnitus_2  tinnitus_3
0                   0           0           0             124992
1                   1           0           0              18918
                                1           1              18112
                                            0              16705
                    0           0           0               2965
dtype: int64

In [154]:
# Rows flagged 1 in tinnitus_pure_ctrl that are also in none of the three case groups.
# Their IIDs are what the following cells remove to build tinnitus_final.
outside_ctrl = filtered_tinnitus["tinnitus_pure_ctrl"] == 1
outside_cases = (
    (filtered_tinnitus["tinnitus_1"] == 0)
    & (filtered_tinnitus["tinnitus_2"] == 0)
    & (filtered_tinnitus["tinnitus_3"] == 0)
)
filtered_tinnitus_final = filtered_tinnitus[outside_ctrl & outside_cases]

In [155]:
filtered_tinnitus_final

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex,tinn_icd10,tinn_icd9,tinn_self_report,tinnitus_pure_ctrl,tinnitus_pure_ctrl2,tinnitus_ctrl_age,tinnitus_ctrl2_age,tinnitus_1,tinnitus_2,tinnitus_3,tinnitus_yes_age
214,224,1002335,1002335,Female,1966,December,Do not know,,,,1349,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,1,1,,,0,0,0,
262,277,1002888,1002888,Male,1940,November,Do not know,,,,1065,1111,1220,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0,0,0,0,1,1,,,0,0,0,
342,363,1003769,1003769,Male,1966,September,Do not know,,,,1111,1312,1494,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,Any_other_white_background,0,0,0,0,1,1,,,0,0,0,
379,400,1004146,1004146,Female,1944,March,Do not know,,,,1065,1094,1111,1476,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,1,1,,,0,0,0,
571,607,1006259,1006259,Female,1949,June,Do not know,,,,1111,1436,1474,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,1,1,,,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458135,485212,6013020,6013020,Female,1951,January,Do not know,,,,1522,1538,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,1,1,,,0,0,0,
458407,485501,6015995,6015995,Female,1961,April,Do not know,,,,1111,1291,1534,1201,1226,1154,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,1,1,,,0,0,0,
458507,485607,6017064,6017064,Female,1967,November,Do not know,,,,1265,1154,1138,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,1,1,,,0,0,0,
458550,485654,6017538,6017538,Male,1945,September,Do not know,,,,1537,1502,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,2011-10-04,British,0,0,0,0,1,1,,,0,0,0,


In [162]:
filtered_tinnitus_final[tin_cols].stack(dropna=True).value_counts(dropna=True)

Do not know             2841
Prefer not to answer     159
dtype: Int64

In [165]:
exclude = filtered_tinnitus_final["IID"].tolist()

In [166]:
tinnitus_final = filtered_tinnitus[~filtered_tinnitus['IID'].isin(exclude)]

In [167]:
tinnitus_final

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex,tinn_icd10,tinn_icd9,tinn_self_report,tinnitus_pure_ctrl,tinnitus_pure_ctrl2,tinnitus_ctrl_age,tinnitus_ctrl2_age,tinnitus_1,tinnitus_2,tinnitus_3,tinnitus_yes_age
3,3,1000046,1000046,Female,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,73,73,0,0,0,
5,5,1000063,1000063,Male,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0,0,0,0,0,0,43,43,0,0,0,
6,6,1000078,1000078,Female,1955,June,,"No, never","No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,60,60,0,0,0,
7,7,1000081,1000081,Male,1942,February,"No, never",,,,1075,1440,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0,0,0,0,0,0,67,67,0,0,0,
8,8,1000090,1000090,Female,1945,July,"No, never",,,,1082,1465,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,Irish,1,0,0,0,0,0,64,64,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459259,486407,6025354,6025354,Female,1957,February,"No, never",,,,1351,1446,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,52,52,0,0,0,
459261,486409,6025378,6025378,Male,1966,October,"No, never",,,,1065,1276,1222,1571,1569,1568,1630,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,0,0,0,0,0,0,42,42,0,0,0,
459262,486411,6025390,6025390,Female,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,67,67,0,0,0,
459263,486412,6025409,6025409,Female,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,66,66,0,0,0,


In [169]:
tinnitus_final[["tinnitus_pure_ctrl","tinnitus_1","tinnitus_2","tinnitus_3"]].value_counts()

tinnitus_pure_ctrl  tinnitus_1  tinnitus_2  tinnitus_3
0                   0           0           0             124992
1                   1           0           0              18918
                                1           1              18112
                                            0              16705
dtype: int64

In [171]:
tinnitus_final["tinnitus_pure_ctrl"].value_counts()

0    124992
1     53735
Name: tinnitus_pure_ctrl, dtype: int64

In [172]:
tinnitus_final["tinnitus_1"].value_counts()

0    124992
1     53735
Name: tinnitus_1, dtype: int64

In [173]:
tinnitus_final["tinnitus_2"].value_counts()

0    143910
1     34817
Name: tinnitus_2, dtype: int64

In [174]:
tinnitus_final["tinnitus_3"].value_counts()

0    160615
1     18112
Name: tinnitus_3, dtype: int64

## Merge with Age related hearing loss sample

In [179]:
# Read the data used for hearing loss analysis
ARHL_pheno = pd.read_csv('~/project_bst/tinnitus/All_HL_Cases_Controls_WhiteEuro.pheno',sep='\t', dtype="string")

In [182]:
ARHL_pheno = ARHL_pheno.rename(columns={ARHL_pheno.columns[3]:"ARHL_pheno"})
ARHL_pheno = ARHL_pheno[["IID","age","array","ARHL_pheno"]]
ARHL_pheno 

Unnamed: 0,IID,age,array,ARHL_pheno
0,1000022,53,1,1
1,1000090,64,1,1
2,1000112,68,1,1
3,1000170,51,2,1
4,1000198,41,1,1
...,...,...,...,...
416219,5941476,69,1,1
416220,5950956,77,1,1
416221,5958527,74,2,1
416222,5978696,73,1,1


In [185]:
ARHL_pheno["ARHL_pheno"].value_counts()

0    237318
1    178906
Name: ARHL_pheno, dtype: Int64

In [232]:
#merge the tinnitus data with individuals used in the hearing loss analysis
tinnitus_pheno = tinnitus_final.merge(ARHL_pheno, on = 'IID',how='inner')

In [233]:
tinnitus_pheno

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,...,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex,tinn_icd10,tinn_icd9,tinn_self_report,tinnitus_pure_ctrl,tinnitus_pure_ctrl2,tinnitus_ctrl_age,tinnitus_ctrl2_age,tinnitus_1,tinnitus_2,tinnitus_3,tinnitus_yes_age,age,array,ARHL_pheno
0,3,1000046,1000046,Female,1946,March,,,"No, never",,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,73,73,0,0,0,,73,1,1
1,5,1000063,1000063,Male,1967,April,"No, never",,,,1387,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,British,0,0,0,0,0,0,43,43,0,0,0,,43,1,0
2,6,1000078,1000078,Female,1955,June,,"No, never","No, never",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,60,60,0,0,0,,60,1,0
3,7,1000081,1000081,Male,1942,February,"No, never",,,,1075,1440,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,British,0,0,0,0,0,0,67,67,0,0,0,,67,1,0
4,8,1000090,1000090,Female,1945,July,"No, never",,,,1082,1465,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,Irish,1,0,0,0,0,0,64,64,0,0,0,,64,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163844,486407,6025354,6025354,Female,1957,February,"No, never",,,,1351,1446,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,52,52,0,0,0,,52,1,1
163845,486409,6025378,6025378,Male,1966,October,"No, never",,,,1065,1276,1222,1571,1569,1568,1630,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,British,0,0,0,0,0,0,42,42,0,0,0,,42,1,1
163846,486411,6025390,6025390,Female,1942,March,"No, never",,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,67,67,0,0,0,,67,1,1
163847,486412,6025409,6025409,Female,1946,November,,"No, never",,,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,British,1,0,0,0,0,0,66,66,0,0,0,,66,1,0


In [None]:
def find_exclude(row):
    """Return 1 if any answer in ``row`` maps to 1 in ``tin_ans``, else 0.

    Missing values and "Prefer not to answer" are skipped; answers that are
    not keys of ``tin_ans`` never count.
    """
    for answer in row[::-1]:
        if pd.isna(answer) or answer == "Prefer not to answer":
            continue
        if tin_ans.get(answer) == 1:
            return 1
    return 0

## Identify cases age

In [134]:
def get_phen_age(row, flag_col=None, answer_cols=None, age_cols=None, answer_codes=None):
    """Return the age at the earliest visit with a "yes" (code 1) answer.

    Columns are addressed by label, so the order in which the caller selects
    them (ages first, flag first, extra columns in between) does not matter.

    Parameters
    ----------
    row : pandas.Series
        One individual, holding the answer and age columns (and ``flag_col``
        when one is given).
    flag_col : str, optional
        When given, only rows whose value in this column is 1 are processed;
        all others yield ``pd.NA``. When omitted, every row is scanned.
    answer_cols, age_cols : list of str, optional
        Answer columns and the age columns of the same visits, listed in the
        same visit order. Default to the notebook globals ``tin_cols`` and
        ``ages_f21003_col``.
    answer_codes : dict, optional
        Maps an answer string to its code. Defaults to the notebook global
        ``tin_ans`` as bound at call time. The notebook rebinds ``tin_ans``
        for each case group, so pass the mapping explicitly to pin one
        definition.

    Returns
    -------
    The age paired with the first code-1 answer, or ``pd.NA`` when there is
    none (or the row is filtered out by ``flag_col``).
    """
    if flag_col is not None and row[flag_col] != 1:
        return pd.NA

    # Globals are resolved lazily so that explicit arguments fully replace them.
    answer_cols = tin_cols if answer_cols is None else answer_cols
    age_cols = ages_f21003_col if age_cols is None else age_cols
    answer_codes = tin_ans if answer_codes is None else answer_codes

    answers = row[answer_cols].to_list()
    ages = row[age_cols].to_list()
    for answer, age in zip(answers, ages):
        if pd.isna(answer):
            continue
        # .get() lets answers missing from the mapping (e.g. "Prefer not to
        # answer") fall through instead of raising KeyError.
        if answer_codes.get(answer) == 1:
            return age
    # Always return an explicit missing value, never an implicit None.
    return pd.NA

In [135]:
# Apply get_phen_age to every row and store the result in "tinnitus_yes_age".
# NOTE(review): get_phen_age compares element 0 with 1 and reads elements 1:-4 as
# answers, but the selection passed here starts with the age columns followed by
# the tinnitus columns — confirm the intended column order / flag column.
filtered_tinnitus["tinnitus_yes_age"] = filtered_tinnitus[ages_f21003_col + tin_cols].apply(get_phen_age, axis=1)

# File Output

## Tinnitus all

In [234]:
# Write every column of tinnitus_final, without the index.
# Note: the file is tab-separated even though its extension is .csv.
tinnitus_final.to_csv('~/project_bst/tinnitus/tinnitus_all_pheno_20221006.csv',sep='\t', index=False)

In [235]:
# Write only the ID, sex, control-age and phenotype columns to the working directory.
# NOTE(review): this uses the same base name as the full export written to
# ~/project_bst/tinnitus/ in the previous cell; if the working directory is that
# folder, this call overwrites the full table — confirm.
tinnitus_final[["FID", "IID", "sex","tinnitus_ctrl_age","tinnitus_pure_ctrl", "tinnitus_1", "tinnitus_2","tinnitus_3"]].to_csv("tinnitus_all_pheno_20221006.csv", sep='\t', index=False)

In [236]:
# Split tinnitus_final into one tab-separated file per phenotype column:
# rows with tinnitus_pure_ctrl == 0, and rows with tinnitus_1 / _2 / _3 == 1.
# Each file keeps FID, IID, sex, tinnitus_ctrl_age and its own phenotype column.
tinnitus_final.loc[
    tinnitus_final["tinnitus_pure_ctrl"] == 0,
    ["FID", "IID", "sex", "tinnitus_ctrl_age", "tinnitus_pure_ctrl"],
].to_csv("tinnitus_pure_ctrl_all.tsv", sep="\t", index=False)
tinnitus_final.loc[
    tinnitus_final["tinnitus_1"] == 1,
    ["FID", "IID", "sex", "tinnitus_ctrl_age", "tinnitus_1"],
].to_csv("tinnitus_group1_all.tsv", sep="\t", index=False)
tinnitus_final.loc[
    tinnitus_final["tinnitus_2"] == 1,
    ["FID", "IID", "sex", "tinnitus_ctrl_age", "tinnitus_2"],
].to_csv("tinnitus_group2_all.tsv", sep="\t", index=False)
tinnitus_final.loc[
    tinnitus_final["tinnitus_3"] == 1,
    ["FID", "IID", "sex", "tinnitus_ctrl_age", "tinnitus_3"],
].to_csv("tinnitus_group3_all.tsv", sep="\t", index=False)

## Tinnitus with all ARHL samples

In [237]:
# First call: write every column of tinnitus_pheno (tab-separated despite the .csv extension).
# Second call: write only the ID, covariate, ARHL and tinnitus phenotype columns
# to the working directory under a different file name.
tinnitus_pheno.to_csv('~/project_bst/tinnitus/tinnitus_pheno_all_ARHL_20221006.csv',sep='\t', index=False)
tinnitus_pheno[["FID", "IID", "sex","age","array","ARHL_pheno","tinnitus_ctrl_age","tinnitus_pure_ctrl", "tinnitus_1", "tinnitus_2","tinnitus_3"]].to_csv("tinnitus_all_ARHL_pheno_20221006.csv", sep='\t', index=False)

In [238]:
# From here on tinnitus_pheno holds only the listed columns; all others are dropped.
tinnitus_pheno = tinnitus_pheno[["FID", "IID", "sex","age","array","ARHL_pheno","tinnitus_ctrl_age","tinnitus_pure_ctrl", "tinnitus_1", "tinnitus_2","tinnitus_3"]]

In [239]:
# Second name for the same DataFrame object (plain assignment, no copy is made).
tinnitus_pheno_all_HL_samples = tinnitus_pheno

In [240]:
tinnitus_pheno

Unnamed: 0,FID,IID,sex,age,array,ARHL_pheno,tinnitus_ctrl_age,tinnitus_pure_ctrl,tinnitus_1,tinnitus_2,tinnitus_3
0,1000046,1000046,1,73,1,1,73,0,0,0,0
1,1000063,1000063,0,43,1,0,43,0,0,0,0
2,1000078,1000078,1,60,1,0,60,0,0,0,0
3,1000081,1000081,0,67,1,0,67,0,0,0,0
4,1000090,1000090,1,64,1,1,64,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
163844,6025354,6025354,1,52,1,1,52,0,0,0,0
163845,6025378,6025378,0,42,1,1,42,0,0,0,0
163846,6025390,6025390,1,67,1,1,67,0,0,0,0
163847,6025409,6025409,1,66,1,0,66,0,0,0,0


In [250]:
tinnitus_pheno[["ARHL_pheno"]].value_counts()

ARHL_pheno
0             90708
1             73141
dtype: int64

In [242]:
tinnitus_pheno[["tinnitus_pure_ctrl"]].value_counts()

tinnitus_pure_ctrl
0                     115296
1                      48553
dtype: int64

In [243]:
tinnitus_pheno[["tinnitus_1"]].value_counts()

tinnitus_1
0             115296
1              48553
dtype: int64

In [244]:
tinnitus_pheno[["tinnitus_2"]].value_counts()

tinnitus_2
0             132412
1              31437
dtype: int64

In [245]:
tinnitus_pheno[["tinnitus_3"]].value_counts()

tinnitus_3
0             147408
1              16441
dtype: int64

In [246]:
# Write the reduced tinnitus_pheno table as TSV. The case/control counts in the
# file name are hard-coded literals (they match the value_counts output shown above).
tinnitus_pheno.to_csv("tinnitus_pheno_all_HL_samples_48553cases_115296controls.tsv", sep='\t', index=False)

In [247]:
# Split tinnitus_pheno into one tab-separated file per phenotype column:
# rows with tinnitus_pure_ctrl == 0, and rows with tinnitus_1 / _2 / _3 == 1.
# Each file keeps the ID, covariate and ARHL columns plus its own phenotype column.
tinnitus_pheno.loc[
    tinnitus_pheno["tinnitus_pure_ctrl"] == 0,
    ["FID", "IID", "sex", "age", "array", "ARHL_pheno", "tinnitus_ctrl_age", "tinnitus_pure_ctrl"],
].to_csv("tinnitus_pure_ctrl_pheno_all_HL_samples.tsv", sep="\t", index=False)
tinnitus_pheno.loc[
    tinnitus_pheno["tinnitus_1"] == 1,
    ["FID", "IID", "sex", "age", "array", "ARHL_pheno", "tinnitus_ctrl_age", "tinnitus_1"],
].to_csv("tinnitus_group1_pheno_all_HL_samples.tsv", sep="\t", index=False)
tinnitus_pheno.loc[
    tinnitus_pheno["tinnitus_2"] == 1,
    ["FID", "IID", "sex", "age", "array", "ARHL_pheno", "tinnitus_ctrl_age", "tinnitus_2"],
].to_csv("tinnitus_group2_pheno_all_HL_samples.tsv", sep="\t", index=False)
tinnitus_pheno.loc[
    tinnitus_pheno["tinnitus_3"] == 1,
    ["FID", "IID", "sex", "age", "array", "ARHL_pheno", "tinnitus_ctrl_age", "tinnitus_3"],
].to_csv("tinnitus_group3_pheno_all_HL_samples.tsv", sep="\t", index=False)

## Tinnitus with only ARHL control samples

In [248]:
# Keep only rows whose ARHL_pheno is 0 and drop the ARHL_pheno column.
# The comparison is done numerically so the filter works whether ARHL_pheno holds
# integers or the strings "0"/"1"; comparing against the literal '0' silently
# selects no rows when the column is numeric.
tinnitus_pheno_only_HL_controls = tinnitus_pheno.loc[
    pd.to_numeric(tinnitus_pheno["ARHL_pheno"], errors="coerce") == 0,
    ["FID", "IID", "sex", "age", "array", "tinnitus_ctrl_age", "tinnitus_pure_ctrl", "tinnitus_1", "tinnitus_2", "tinnitus_3"],
]

In [249]:
tinnitus_pheno_only_HL_controls

Unnamed: 0,FID,IID,sex,age,array,tinnitus_ctrl_age,tinnitus_pure_ctrl,tinnitus_1,tinnitus_2,tinnitus_3
1,1000063,1000063,0,43,1,43,0,0,0,0
2,1000078,1000078,1,60,1,60,0,0,0,0
3,1000081,1000081,0,67,1,67,0,0,0,0
7,1000141,1000141,1,49,1,49,0,0,0,0
8,1000236,1000236,0,70,1,70,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
163841,6025230,6025230,1,52,1,52,0,0,0,0
163842,6025307,6025307,1,55,2,,1,1,1,1
163843,6025319,6025319,1,56,1,56,0,0,0,0
163847,6025409,6025409,1,66,1,66,0,0,0,0


In [251]:
tinnitus_pheno_only_HL_controls[["tinnitus_pure_ctrl","tinnitus_1","tinnitus_2","tinnitus_3"]].value_counts()

tinnitus_pure_ctrl  tinnitus_1  tinnitus_2  tinnitus_3
0                   0           0           0             73509
1                   1           0           0              7992
                                1           0              5696
                                            1              3511
dtype: int64

In [252]:
tinnitus_pheno_only_HL_controls[["tinnitus_pure_ctrl"]].value_counts()

tinnitus_pure_ctrl
0                     73509
1                     17199
dtype: int64

In [253]:
tinnitus_pheno_only_HL_controls[["tinnitus_1"]].value_counts()

tinnitus_1
0             73509
1             17199
dtype: int64

In [254]:
tinnitus_pheno_only_HL_controls[["tinnitus_2"]].value_counts()

tinnitus_2
0             81501
1              9207
dtype: int64

In [255]:
tinnitus_pheno_only_HL_controls[["tinnitus_3"]].value_counts()

tinnitus_3
0             87197
1              3511
dtype: int64

In [256]:
# Write the ARHL-controls-only table as TSV. The case/control counts in the file
# name are hard-coded literals (they match the value_counts output shown above).
tinnitus_pheno_only_HL_controls.to_csv("tinnitus_pheno_only_HL_controls_17199cases_73509controls.tsv", sep='\t', index=False)

In [257]:
# Split the ARHL-controls-only table into one tab-separated file per phenotype
# column: rows with tinnitus_pure_ctrl == 0, and rows with tinnitus_1 / _2 / _3 == 1.
tinnitus_pheno_only_HL_controls.loc[
    tinnitus_pheno_only_HL_controls["tinnitus_pure_ctrl"] == 0,
    ["FID", "IID", "sex", "age", "array", "tinnitus_ctrl_age", "tinnitus_pure_ctrl"],
].to_csv("tinnitus_pure_ctrl_pheno_controls.tsv", sep="\t", index=False)
tinnitus_pheno_only_HL_controls.loc[
    tinnitus_pheno_only_HL_controls["tinnitus_1"] == 1,
    ["FID", "IID", "sex", "age", "array", "tinnitus_ctrl_age", "tinnitus_1"],
].to_csv("tinnitus_group1_pheno_controls.tsv", sep="\t", index=False)
tinnitus_pheno_only_HL_controls.loc[
    tinnitus_pheno_only_HL_controls["tinnitus_2"] == 1,
    ["FID", "IID", "sex", "age", "array", "tinnitus_ctrl_age", "tinnitus_2"],
].to_csv("tinnitus_group2_pheno_controls.tsv", sep="\t", index=False)
tinnitus_pheno_only_HL_controls.loc[
    tinnitus_pheno_only_HL_controls["tinnitus_3"] == 1,
    ["FID", "IID", "sex", "age", "array", "tinnitus_ctrl_age", "tinnitus_3"],
].to_csv("tinnitus_group3_pheno_controls.tsv", sep="\t", index=False)

## Merge each case group with the controls

### Tinnitus all samples

In [263]:
# Relative paths of the per-group files written from tinnitus_final earlier in
# this notebook (control rows and case groups 1-3).
tinnitus_ctrl_file_name = "tinnitus_pure_ctrl_all.tsv"
tinnitus_1_file_name = "tinnitus_group1_all.tsv"
tinnitus_2_file_name = "tinnitus_group2_all.tsv"
tinnitus_3_file_name = "tinnitus_group3_all.tsv"

In [264]:
# Load the control file and the three case-group files as tab-separated tables;
# every column is read with the pandas "string" dtype.
tinnitus_ctrl, tinnitus_group1, tinnitus_group2, tinnitus_group3 = (
    pd.read_csv(path, sep="\t", dtype="string")
    for path in (
        tinnitus_ctrl_file_name,
        tinnitus_1_file_name,
        tinnitus_2_file_name,
        tinnitus_3_file_name,
    )
)

In [265]:
tinnitus_ctrl

Unnamed: 0,FID,IID,sex,tinnitus_ctrl_age,tinnitus_pure_ctrl
0,1000046,1000046,1,73,0
1,1000063,1000063,0,43,0
2,1000078,1000078,1,60,0
3,1000081,1000081,0,67,0
4,1000090,1000090,1,64,0
...,...,...,...,...,...
124987,6025354,6025354,1,52,0
124988,6025378,6025378,0,42,0
124989,6025390,6025390,1,67,0
124990,6025409,6025409,1,66,0


In [266]:
tinnitus_group1

Unnamed: 0,FID,IID,sex,tinnitus_ctrl_age,tinnitus_1
0,1000105,1000105,1,,1
1,1000331,1000331,1,,1
2,1000439,1000439,1,,1
3,1000443,1000443,1,,1
4,1000494,1000494,0,,1
...,...,...,...,...,...
53730,6024620,6024620,0,,1
53731,6024671,6024671,0,,1
53732,6024812,6024812,0,,1
53733,6024911,6024911,0,,1


In [267]:
# Rename the control table's phenotype column (position 4) to the group-1 phenotype
# column name so both tables share the same columns, then stack cases above controls.
tinnitus_ctrl = tinnitus_ctrl.rename(
    columns={tinnitus_ctrl.columns[4]: tinnitus_group1.columns[4]}
)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result and,
# like append, keeps each table's original row labels.
full_tinnitus_group1 = pd.concat([tinnitus_group1, tinnitus_ctrl])
full_tinnitus_group1



Unnamed: 0,FID,IID,sex,tinnitus_ctrl_age,tinnitus_1
0,1000105,1000105,1,,1
1,1000331,1000331,1,,1
2,1000439,1000439,1,,1
3,1000443,1000443,1,,1
4,1000494,1000494,0,,1
...,...,...,...,...,...
124987,6025354,6025354,1,52,0
124988,6025378,6025378,0,42,0
124989,6025390,6025390,1,67,0
124990,6025409,6025409,1,66,0


In [268]:
# Rename the control table's phenotype column (position 4) to the group-2 phenotype
# column name so both tables share the same columns, then stack cases above controls.
tinnitus_ctrl = tinnitus_ctrl.rename(
    columns={tinnitus_ctrl.columns[4]: tinnitus_group2.columns[4]}
)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result and,
# like append, keeps each table's original row labels.
full_tinnitus_group2 = pd.concat([tinnitus_group2, tinnitus_ctrl])
full_tinnitus_group2



Unnamed: 0,FID,IID,sex,tinnitus_ctrl_age,tinnitus_2
0,1000439,1000439,1,,1
1,1000713,1000713,0,,1
2,1000728,1000728,0,,1
3,1001045,1001045,0,,1
4,1001052,1001052,1,,1
...,...,...,...,...,...
124987,6025354,6025354,1,52,0
124988,6025378,6025378,0,42,0
124989,6025390,6025390,1,67,0
124990,6025409,6025409,1,66,0


In [269]:
# Rename the control table's phenotype column (position 4) to the group-3 phenotype
# column name so both tables share the same columns, then stack cases above controls.
tinnitus_ctrl = tinnitus_ctrl.rename(
    columns={tinnitus_ctrl.columns[4]: tinnitus_group3.columns[4]}
)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result and,
# like append, keeps each table's original row labels.
full_tinnitus_group3 = pd.concat([tinnitus_group3, tinnitus_ctrl])
full_tinnitus_group3



Unnamed: 0,FID,IID,sex,tinnitus_ctrl_age,tinnitus_3
0,1000713,1000713,0,,1
1,1000728,1000728,0,,1
2,1001123,1001123,1,,1
3,1001162,1001162,1,,1
4,1001929,1001929,0,,1
...,...,...,...,...,...
124987,6025354,6025354,1,52,0
124988,6025378,6025378,0,42,0
124989,6025390,6025390,1,67,0
124990,6025409,6025409,1,66,0


In [270]:
full_tinnitus_group1[["tinnitus_1"]].value_counts()

tinnitus_1
0             124992
1              53735
dtype: int64

In [271]:
full_tinnitus_group2[["tinnitus_2"]].value_counts()

tinnitus_2
0             124992
1              34817
dtype: int64

In [272]:
full_tinnitus_group3[["tinnitus_3"]].value_counts()

tinnitus_3
0             124992
1              18112
dtype: int64

In [274]:
# Write the three merged case/control tables as TSV without the index.
# The case/control counts in the file names are hard-coded literals; they match
# the value_counts output shown above for these tables.
full_tinnitus_group1.to_csv("~/project_bst/tinnitus/tinnitus_case_group1_all_53735cases_124992controls.tsv", sep='\t', index=False)
full_tinnitus_group2.to_csv("~/project_bst/tinnitus/tinnitus_case_group2_all_34817cases_124992controls.tsv", sep='\t', index=False)
full_tinnitus_group3.to_csv("~/project_bst/tinnitus/tinnitus_case_group3_all_18112cases_124992controls.tsv", sep='\t', index=False)

### Tinnitus with all ARHL samples

In [275]:
# Relative paths of the per-group files written from tinnitus_pheno (all ARHL
# samples) earlier in this notebook. These rebind the same four variable names
# used for the previous set of files.
tinnitus_ctrl_file_name = "tinnitus_pure_ctrl_pheno_all_HL_samples.tsv"
tinnitus_1_file_name = "tinnitus_group1_pheno_all_HL_samples.tsv"
tinnitus_2_file_name = "tinnitus_group2_pheno_all_HL_samples.tsv"
tinnitus_3_file_name = "tinnitus_group3_pheno_all_HL_samples.tsv"

In [280]:
# Load the control file and the three case-group files as tab-separated tables;
# every column is read with the pandas "string" dtype.
tinnitus_ctrl, tinnitus_group1, tinnitus_group2, tinnitus_group3 = (
    pd.read_csv(path, sep="\t", dtype="string")
    for path in (
        tinnitus_ctrl_file_name,
        tinnitus_1_file_name,
        tinnitus_2_file_name,
        tinnitus_3_file_name,
    )
)

In [284]:
# Rename the control table's phenotype column (position 7) to the group-1 phenotype
# column name so both tables share the same columns, then stack cases above controls.
tinnitus_ctrl = tinnitus_ctrl.rename(
    columns={tinnitus_ctrl.columns[7]: tinnitus_group1.columns[7]}
)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result and,
# like append, keeps each table's original row labels.
full_tinnitus_group1 = pd.concat([tinnitus_group1, tinnitus_ctrl])
full_tinnitus_group1



Unnamed: 0,FID,IID,sex,age,array,ARHL_pheno,tinnitus_ctrl_age,tinnitus_1
0,1000105,1000105,1,54,1,1,,1
1,1000331,1000331,1,53,1,0,,1
2,1000439,1000439,1,59,1,0,,1
3,1000494,1000494,0,61,1,1,,1
4,1000728,1000728,0,61,1,1,,1
...,...,...,...,...,...,...,...,...
115291,6025354,6025354,1,52,1,1,52,0
115292,6025378,6025378,0,42,1,1,42,0
115293,6025390,6025390,1,67,1,1,67,0
115294,6025409,6025409,1,66,1,0,66,0


In [285]:
# Rename the control table's phenotype column (position 7) to the group-2 phenotype
# column name so both tables share the same columns, then stack cases above controls.
tinnitus_ctrl = tinnitus_ctrl.rename(
    columns={tinnitus_ctrl.columns[7]: tinnitus_group2.columns[7]}
)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result and,
# like append, keeps each table's original row labels.
full_tinnitus_group2 = pd.concat([tinnitus_group2, tinnitus_ctrl])
full_tinnitus_group2



Unnamed: 0,FID,IID,sex,age,array,ARHL_pheno,tinnitus_ctrl_age,tinnitus_2
0,1000439,1000439,1,59,1,0,,1
1,1000728,1000728,0,61,1,1,,1
2,1001045,1001045,0,61,1,1,,1
3,1001052,1001052,1,64,1,1,,1
4,1001067,1001067,0,50,1,1,,1
...,...,...,...,...,...,...,...,...
115291,6025354,6025354,1,52,1,1,52,0
115292,6025378,6025378,0,42,1,1,42,0
115293,6025390,6025390,1,67,1,1,67,0
115294,6025409,6025409,1,66,1,0,66,0


In [286]:
# Rename the control table's phenotype column (position 7) to the group-3 phenotype
# column name so both tables share the same columns, then stack cases above controls.
tinnitus_ctrl = tinnitus_ctrl.rename(
    columns={tinnitus_ctrl.columns[7]: tinnitus_group3.columns[7]}
)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result and,
# like append, keeps each table's original row labels.
full_tinnitus_group3 = pd.concat([tinnitus_group3, tinnitus_ctrl])
full_tinnitus_group3



Unnamed: 0,FID,IID,sex,age,array,ARHL_pheno,tinnitus_ctrl_age,tinnitus_3
0,1000728,1000728,0,61,1,1,,1
1,1001123,1001123,1,62,1,1,,1
2,1001162,1001162,1,70,2,0,,1
3,1001929,1001929,0,51,1,1,,1
4,1002859,1002859,0,69,1,0,,1
...,...,...,...,...,...,...,...,...
115291,6025354,6025354,1,52,1,1,52,0
115292,6025378,6025378,0,42,1,1,42,0
115293,6025390,6025390,1,67,1,1,67,0
115294,6025409,6025409,1,66,1,0,66,0


In [287]:
full_tinnitus_group1[["tinnitus_1"]].value_counts()

tinnitus_1
0             115296
1              48553
dtype: int64

In [288]:
full_tinnitus_group2[["tinnitus_2"]].value_counts()

tinnitus_2
0             115296
1              31437
dtype: int64

In [289]:
full_tinnitus_group3[["tinnitus_3"]].value_counts()

tinnitus_3
0             115296
1              16441
dtype: int64

In [290]:
# Write the three merged case/control tables (all ARHL samples) as TSV without the index.
# The case/control counts in the file names are hard-coded literals; they match
# the value_counts output shown above for these tables.
full_tinnitus_group1.to_csv("~/project_bst/tinnitus/tinnitus_case_group1_all_HL_48553cases_115296controls.tsv", sep='\t', index=False)
full_tinnitus_group2.to_csv("~/project_bst/tinnitus/tinnitus_case_group2_all_HL_31437cases_115296controls.tsv", sep='\t', index=False)
full_tinnitus_group3.to_csv("~/project_bst/tinnitus/tinnitus_case_group3_all_HL_16441cases_115296controls.tsv", sep='\t', index=False)

### Tinnitus with only ARHL control samples

In [291]:
# Relative paths of the per-group files written from the ARHL-controls-only table
# earlier in this notebook. These rebind the same four variable names used for
# the previous sets of files.
tinnitus_ctrl_file_name = "tinnitus_pure_ctrl_pheno_controls.tsv"
tinnitus_1_file_name = "tinnitus_group1_pheno_controls.tsv"
tinnitus_2_file_name = "tinnitus_group2_pheno_controls.tsv"
tinnitus_3_file_name = "tinnitus_group3_pheno_controls.tsv"

In [293]:
# Load the control file and the three case-group files as tab-separated tables;
# every column is read with the pandas "string" dtype.
tinnitus_ctrl, tinnitus_group1, tinnitus_group2, tinnitus_group3 = (
    pd.read_csv(path, sep="\t", dtype="string")
    for path in (
        tinnitus_ctrl_file_name,
        tinnitus_1_file_name,
        tinnitus_2_file_name,
        tinnitus_3_file_name,
    )
)

In [296]:
# Rename the control table's phenotype column (position 6) to the group-1 phenotype
# column name so both tables share the same columns, then stack cases above controls.
tinnitus_ctrl = tinnitus_ctrl.rename(
    columns={tinnitus_ctrl.columns[6]: tinnitus_group1.columns[6]}
)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result and,
# like append, keeps each table's original row labels.
full_tinnitus_group1 = pd.concat([tinnitus_group1, tinnitus_ctrl])
full_tinnitus_group1



Unnamed: 0,FID,IID,sex,age,array,tinnitus_ctrl_age,tinnitus_1
0,1000331,1000331,1,53,1,,1
1,1000439,1000439,1,59,1,,1
2,1000992,1000992,0,56,1,,1
3,1001162,1001162,1,70,2,,1
4,1001179,1001179,1,66,1,,1
...,...,...,...,...,...,...,...
73504,6025227,6025227,1,68,1,68,0
73505,6025230,6025230,1,52,1,52,0
73506,6025319,6025319,1,56,1,56,0
73507,6025409,6025409,1,66,1,66,0


In [297]:
# Rename the control table's phenotype column (position 6) to the group-2 phenotype
# column name so both tables share the same columns, then stack cases above controls.
tinnitus_ctrl = tinnitus_ctrl.rename(
    columns={tinnitus_ctrl.columns[6]: tinnitus_group2.columns[6]}
)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result and,
# like append, keeps each table's original row labels.
full_tinnitus_group2 = pd.concat([tinnitus_group2, tinnitus_ctrl])
full_tinnitus_group2



Unnamed: 0,FID,IID,sex,age,array,tinnitus_ctrl_age,tinnitus_2
0,1000439,1000439,1,59,1,,1
1,1001162,1001162,1,70,2,,1
2,1001395,1001395,1,64,1,,1
3,1001566,1001566,0,66,1,,1
4,1001744,1001744,1,50,1,,1
...,...,...,...,...,...,...,...
73504,6025227,6025227,1,68,1,68,0
73505,6025230,6025230,1,52,1,52,0
73506,6025319,6025319,1,56,1,56,0
73507,6025409,6025409,1,66,1,66,0


In [298]:
# Rename the control table's phenotype column (position 6) to the group-3 phenotype
# column name so both tables share the same columns, then stack cases above controls.
tinnitus_ctrl = tinnitus_ctrl.rename(
    columns={tinnitus_ctrl.columns[6]: tinnitus_group3.columns[6]}
)
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result and,
# like append, keeps each table's original row labels.
full_tinnitus_group3 = pd.concat([tinnitus_group3, tinnitus_ctrl])
full_tinnitus_group3



Unnamed: 0,FID,IID,sex,age,array,tinnitus_ctrl_age,tinnitus_3
0,1001162,1001162,1,70,2,,1
1,1002859,1002859,0,69,1,,1
2,1002926,1002926,0,76,1,,1
3,1003431,1003431,1,61,1,,1
4,1003485,1003485,1,54,1,,1
...,...,...,...,...,...,...,...
73504,6025227,6025227,1,68,1,68,0
73505,6025230,6025230,1,52,1,52,0
73506,6025319,6025319,1,56,1,56,0
73507,6025409,6025409,1,66,1,66,0


In [299]:
full_tinnitus_group1[["tinnitus_1"]].value_counts()

tinnitus_1
0             73509
1             17199
dtype: int64

In [300]:
full_tinnitus_group2[["tinnitus_2"]].value_counts()

tinnitus_2
0             73509
1              9207
dtype: int64

In [301]:
full_tinnitus_group3[["tinnitus_3"]].value_counts()

tinnitus_3
0             73509
1              3511
dtype: int64

In [302]:
# Write the three merged case/control tables (ARHL controls only) as TSV without the index.
# The case/control counts in the file names are hard-coded literals; they match
# the value_counts output shown above for these tables.
# NOTE(review): the group-1 file name has no "_all_" infix while the group-2 and
# group-3 names do ("group2_all_only_HLcontrols") — confirm which pattern is intended.
full_tinnitus_group1.to_csv("~/project_bst/tinnitus/tinnitus_case_group1_only_HLcontrols_17199cases_73509controls.tsv", sep='\t', index=False)
full_tinnitus_group2.to_csv("~/project_bst/tinnitus/tinnitus_case_group2_all_only_HLcontrols_9207cases_73509controls.tsv", sep='\t', index=False)
full_tinnitus_group3.to_csv("~/project_bst/tinnitus/tinnitus_case_group3_all_only_HLcontrols_3511cases_73509controls.tsv", sep='\t', index=False)