# Generate the phenotypes for the hearing impairment traits from the UKBB

In [2]:
import pandas as pd
import numpy as np

# Read in the data

In [3]:
exclusion = pd.read_csv("~/ICD10_9_selfreport_incl_excl.csv")
exclusion

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
0,f.41270,H60-H62 Diseases of external ear,,,,,,,,,,
1,f.41270,H60 Otitis externa,,,,,,,,,,
2,f.41270,H60.0 Abscess of external ear,32,N,N,,,,,,,
3,f.41270,H60.1 Cellulitis of external ear,218,N,N,,,,,,,
4,f.41270,H60.2 Malignant otitis externa,49,N,N,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
571,f.20002,1491 brain haemorrhage,218,Y,,,,,,,,
572,f.20002,1583 ischaemic stroke,44,N,N,,,,,,,
573,f.20002,1082 transient ischaemic attack (tia),2243,N,N,,,,,,,
574,f.20002,1083 subdural haemorrhage/haematoma,212,Y,,,,,,,,


In [4]:
outlier = pd.read_csv("~/030821_ukb42495_exomed_white_189010ind.pheno.white_expanded_07_09_21_genoarray_projected.pca.projected.outliers", sep="\t")
outlier

Unnamed: 0,1008606,1008606.1
0,1010412,1010412
1,1045757,1045757
2,1057699,1057699
3,1069457,1069457
4,1071240,1071240
...,...,...
562,4773865,4773865
563,5109700,5109700
564,5637210,5637210
565,5748329,5748329


In [5]:
df = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/062421_UKBB_HI_exomes_189009ind.csv", quotechar = '"', dtype="string")
df

Unnamed: 0,IID,FID,ignore1,ignore2,ignore3,ignore4,f.31.0.0,f.34.0.0,f.53.0.0,f.53.1.0,...,f.131228.0.0,f.131229.0.0,f.131230.0.0,f.131231.0.0,f.131232.0.0,f.131233.0.0,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
0,1000019,1000019,0,0,2,-9,Female,1960,2008-01-24,,...,,,,,,,,,,
1,1000035,1000035,0,0,1,-9,Male,1944,2007-11-08,,...,,,,,,,,,,
2,1000078,1000078,0,0,2,-9,Female,1955,2007-08-20,2012-11-01,...,,,,,,,,,,
3,1000081,1000081,0,0,1,-9,Male,1942,2009-12-03,,...,,,,,,,,,,
4,1000198,1000198,0,0,2,-9,Female,1967,2009-03-27,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189004,6025295,6025295,0,0,1,-9,Male,1961,2008-01-24,,...,1990-04-05,Primary care only,1991-02-25,Primary care only,,,,,,
189005,6025319,6025319,0,0,2,-9,Female,1953,2010-02-03,,...,,,,,,,,,,
189006,6025346,6025346,0,0,2,-9,Female,1954,2008-08-19,,...,,,,,,,,,,
189007,6025363,6025363,0,0,1,-9,Male,1944,2008-05-17,,...,,,,,,,,,,


# Remove outliers from the Full database

In [6]:
out_ids = outlier["1008606"].to_list()

#find_outliers = lambda row: row["IID"] in out_ids

def find_outliers(row):
    return row["IID"] in out_ids

In [7]:
exclude = df[["IID", "FID"]].apply(find_outliers, axis=1)

In [8]:
sum(exclude)

0

In [9]:
df = df[~exclude]

# Filter Out Exclusions from the Full database

## Filter out ICD 10 exclusions

In [10]:
exclude_icd10 = exclusion[(exclusion["UKBB_field_code"] == "f.41270") & (exclusion["Excluded_fulldb_lateonsetHI"] == 'Y') ]
exclude_icd10

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
27,f.41270,H65.2 Chronic serous otitis media,103,Y,,,,,,,,
28,f.41270,H65.3 Chronic mucoid otitis media,960,Y,,,,,,,,
29,f.41270,H65.4 Other chronic nonsuppurative otitis media,158,Y,,,,,,,,
30,f.41270,"H65.9 Nonsuppurative otitis media, unspecified",508,Y,,,,,,,,
33,f.41270,H66.1 Chronic tubotympanic suppurative otitis ...,40,Y,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
276,f.41270,"S07.9 Crushing injury of head, part unspecified",1,Y,,,,,,,,
279,f.41270,S08.1 Traumatic amputation of ear,13,Y,,,,,,,,
280,f.41270,S08.8 Traumatic amputation of other parts of head,1,Y,,,,,,,,
281,f.41270,S08.9 Traumatic amputation of unspecified part...,1,Y,,,,,,,,


In [11]:
icd10_colnames = [col for col in df if "f.41270" in col]

In [12]:
icd10 = df[icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,...,f.41270.0.203,f.41270.0.204,f.41270.0.205,f.41270.0.206,f.41270.0.207,f.41270.0.208,f.41270.0.209,f.41270.0.210,f.41270.0.211,f.41270.0.212
0,E041,H738,M750,M754,M758,N898,N920,N946,R104,Z038,...,,,,,,,,,,
1,H269,K579,K590,K621,N40,R398,Z466,Z538,,,...,,,,,,,,,,
2,D037,L720,,,,,,,,,...,,,,,,,,,,
3,E780,H251,H269,I10,I210,I219,I251,I252,I258,I259,...,,,,,,,,,,
4,D171,I10,J301,K409,N898,O800,Z370,Z721,Z822,Z861,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189004,D171,I845,I846,K318,K429,K529,K602,K610,K921,R073,...,,,,,,,,,,
189005,E669,I10,K801,K802,K85,M179,M233,N921,R102,,...,,,,,,,,,,
189006,O074,,,,,,,,,,...,,,,,,,,,,
189007,,,,,,,,,,,...,,,,,,,,,,


In [13]:
ex_critia_icd10 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_icd10["Phenotype"].tolist()]
ex_critia_icd10

['H652',
 'H653',
 'H654',
 'H659',
 'H661',
 'H662',
 'H663',
 'H664',
 'H669',
 'H680',
 'H701',
 'H702',
 'H708',
 'H709',
 'H71',
 'H731',
 'H738',
 'H739',
 'H740',
 'H741',
 'H742',
 'H743',
 'H748',
 'H749',
 'H750',
 'H758',
 'H800',
 'H801',
 'H802',
 'H808',
 'H809',
 'H810',
 'H830',
 'H831',
 'H832',
 'H900',
 'H901',
 'H902',
 'H910',
 'H933',
 'H940',
 'H948',
 'H950',
 'H951',
 'H958',
 'H959',
 'B020',
 'B021',
 'B022',
 'B023',
 'B027',
 'B028',
 'G000',
 'G001',
 'G002',
 'G003',
 'G008',
 'G009',
 'G01',
 'G020',
 'G021',
 'G028',
 'G030',
 'G031',
 'G032',
 'G038',
 'G039',
 'G040',
 'G041',
 'G042',
 'G048',
 'G049',
 'G050',
 'G051',
 'G052',
 'G058',
 'G060',
 'G061',
 'G062',
 'G07',
 'G08',
 'G09',
 'G510',
 'G511',
 'G512',
 'G513',
 'G514',
 'G518',
 'G519',
 'S0200',
 'S0201',
 'S0210',
 'S0211',
 'S0240',
 'S0241',
 'S0260',
 'S0261',
 'S0270',
 'S0271',
 'S0280',
 'S0281',
 'S0290',
 'S0291',
 'S045',
 'S046',
 'S049',
 'S0600',
 'S0601',
 'S0610',
 'S0611

In [14]:
def contains_exclusion(row, exclusion_list):
    for i in row:
        try: # this is to catch the nan issue
            if i in exclusion_list:
                return True
        except:
            continue
            
    return False

In [15]:
ex_fxn_icd10 = lambda row: contains_exclusion(row, ex_critia_icd10)
ex_10 = icd10.apply(ex_fxn_icd10, axis=1)

In [16]:
filtered = df[~ex_10]

In [17]:
filtered

Unnamed: 0,IID,FID,ignore1,ignore2,ignore3,ignore4,f.31.0.0,f.34.0.0,f.53.0.0,f.53.1.0,...,f.131228.0.0,f.131229.0.0,f.131230.0.0,f.131231.0.0,f.131232.0.0,f.131233.0.0,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
1,1000035,1000035,0,0,1,-9,Male,1944,2007-11-08,,...,,,,,,,,,,
2,1000078,1000078,0,0,2,-9,Female,1955,2007-08-20,2012-11-01,...,,,,,,,,,,
3,1000081,1000081,0,0,1,-9,Male,1942,2009-12-03,,...,,,,,,,,,,
4,1000198,1000198,0,0,2,-9,Female,1967,2009-03-27,,...,,,,,,,,,,
5,1000210,1000210,0,0,1,-9,Male,1941,2008-02-11,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189004,6025295,6025295,0,0,1,-9,Male,1961,2008-01-24,,...,1990-04-05,Primary care only,1991-02-25,Primary care only,,,,,,
189005,6025319,6025319,0,0,2,-9,Female,1953,2010-02-03,,...,,,,,,,,,,
189006,6025346,6025346,0,0,2,-9,Female,1954,2008-08-19,,...,,,,,,,,,,
189007,6025363,6025363,0,0,1,-9,Male,1944,2008-05-17,,...,,,,,,,,,,


## Filter out ICD 9 exclusions

In [18]:
exclude_icd9 = exclusion[(exclusion["UKBB_field_code"] == "f.41271") & (exclusion["Excluded_fulldb_lateonsetHI"] == 'Y') ]
exclude_icd9

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
324,f.41271,3811 Chronic serous otitis media,8,Y,,,,,,,,
325,f.41271,3812 Chronic mucoid otitis media,11,Y,,,,,,,,
326,f.41271,3813 Other and unspecified chronic nonsuppurat...,3,Y,,,,,,,,
327,f.41271,"3814 Nonsuppurative otitis media, not specifie...",19,Y,,,,,,,,
328,f.41271,3815 Eustachian salpingitis,0,Y,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
521,f.41271,9050 Late effect of fracture of skull and face...,19,Y,,,,,,,,
531,f.41271,"9259 Crushing injury of face, scalp and neck",2,Y,,,,,,,,
537,f.41271,9514 Injury to facial nerve,0,Y,,,,,,,,
538,f.41271,9515 Injury to acoustic nerve,1,Y,,,,,,,,


In [19]:
icd9_colnames = [col for col in filtered if "f.41271" in col]

In [20]:
icd9 = filtered[icd9_colnames]
icd9

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,...,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189004,3899,4781,8131,E8860,V540,,,,,,...,,,,,,,,,,
189005,,,,,,,,,,,...,,,,,,,,,,
189006,,,,,,,,,,,...,,,,,,,,,,
189007,,,,,,,,,,,...,,,,,,,,,,


In [21]:
ex_critia_icd9 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_icd9["Phenotype"].tolist()]
ex_critia_icd9

['3811',
 '3812',
 '3813',
 '3814',
 '3815',
 '3816',
 '3819',
 '3821',
 '3822',
 '3823',
 '3824',
 '3829',
 '3831',
 '3832',
 '3833',
 '3838',
 '3839',
 '3841',
 '3850',
 '3851',
 '3852',
 '3853',
 '3858',
 '3859',
 '3860',
 '3863',
 '3864',
 '3865',
 '3868',
 '3869',
 '3870',
 '3871',
 '3872',
 '3878',
 '3879',
 '3885',
 '3890',
 '0530',
 '0531',
 '0532',
 '0537',
 '0538',
 '3200',
 '3201',
 '3202',
 '3203',
 '3204',
 '3205',
 '3207',
 '3208',
 '3209',
 '3210',
 '3211',
 '3212',
 '3213',
 '3214',
 '3215',
 '3216',
 '3217',
 '3218',
 '3220',
 '3221',
 '3222',
 '3229',
 '3230',
 '3231',
 '3232',
 '3233',
 '3234',
 '3235',
 '3236',
 '3237',
 '3238',
 '3239',
 '3240',
 '3241',
 '3249',
 '3259',
 '3269',
 '3510',
 '3511',
 '3518',
 '3519',
 '8000',
 '8001',
 '8002',
 '8003',
 '8010',
 '8011',
 '8012',
 '8013',
 '8022',
 '8023',
 '8024',
 '8025',
 '8028',
 '8029',
 '8030',
 '8031',
 '8032',
 '8033',
 '8040',
 '8041',
 '8042',
 '8043',
 '8509',
 '8510',
 '8511',
 '8520',
 '8521',
 '8530',
 

In [22]:
ex_fxn_icd9 = lambda row: contains_exclusion(row, ex_critia_icd9)
ex_9 = icd9.apply(ex_fxn_icd9, axis=1)

In [23]:
filtered = filtered[~ex_9]

In [24]:
filtered

Unnamed: 0,IID,FID,ignore1,ignore2,ignore3,ignore4,f.31.0.0,f.34.0.0,f.53.0.0,f.53.1.0,...,f.131228.0.0,f.131229.0.0,f.131230.0.0,f.131231.0.0,f.131232.0.0,f.131233.0.0,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
1,1000035,1000035,0,0,1,-9,Male,1944,2007-11-08,,...,,,,,,,,,,
2,1000078,1000078,0,0,2,-9,Female,1955,2007-08-20,2012-11-01,...,,,,,,,,,,
3,1000081,1000081,0,0,1,-9,Male,1942,2009-12-03,,...,,,,,,,,,,
4,1000198,1000198,0,0,2,-9,Female,1967,2009-03-27,,...,,,,,,,,,,
5,1000210,1000210,0,0,1,-9,Male,1941,2008-02-11,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189004,6025295,6025295,0,0,1,-9,Male,1961,2008-01-24,,...,1990-04-05,Primary care only,1991-02-25,Primary care only,,,,,,
189005,6025319,6025319,0,0,2,-9,Female,1953,2010-02-03,,...,,,,,,,,,,
189006,6025346,6025346,0,0,2,-9,Female,1954,2008-08-19,,...,,,,,,,,,,
189007,6025363,6025363,0,0,1,-9,Male,1944,2008-05-17,,...,,,,,,,,,,


## Filter out f.20002 exclusions

In [25]:
exclude_f20002 = exclusion[(exclusion["UKBB_field_code"] == "f.20002") & (exclusion["Excluded_fulldb_lateonsetHI"] == 'Y') ]
exclude_f20002

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
544,f.20002,1420 otosclerosis,260,Y,,,,,,,,
545,f.20002,1421 meniere's disease,1553,Y,,,,,,,,
546,f.20002,1499 labyrinthitis,417,Y,,,,,,,,
550,f.20002,1244 infection of nervous system,55,Y,,,,,,,,
551,f.20002,1245 brain abscess/intracranial abscess,79,Y,,,,,,,,
552,f.20002,1246 encephalitis,348,Y,,,,,,,,
553,f.20002,1247 meningitis,2214,Y,,,,,,,,
555,f.20002,1249 cranial nerve problem/palsy,289,Y,,,,,,,,
556,f.20002,1250 bell's palsy/facial nerve palsy,591,Y,,,,,,,,
558,f.20002,1240 neurological injury/trauma,130,Y,,,,,,,,


In [26]:
f20002_colnames = [col for col in filtered if "f.20002" in col]

In [27]:
f20002 = filtered[f20002_colnames]
f20002

Unnamed: 0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,...,f.20002.3.24,f.20002.3.25,f.20002.3.26,f.20002.3.27,f.20002.3.28,f.20002.3.29,f.20002.3.30,f.20002.3.31,f.20002.3.32,f.20002.3.33
1,1396,1473,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,1075,1440,1473,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1065,1123,1286,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189004,,,,,,,,,,,...,,,,,,,,,,
189005,1065,,,,,,,,,,...,,,,,,,,,,
189006,1452,1265,1387,,,,,,,,...,,,,,,,,,,
189007,,,,,,,,,,,...,,,,,,,,,,


In [28]:
ex_critia_f20002 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_f20002["Phenotype"].tolist()]
ex_critia_f20002

['1420',
 '1421',
 '1499',
 '1244',
 '1245',
 '1246',
 '1247',
 '1249',
 '1250',
 '1240',
 '1626',
 '1086',
 '1491',
 '1083',
 '1425']

In [29]:
ex_fxn_f20002 = lambda row: contains_exclusion(row, ex_critia_f20002)
ex_f20002 = f20002.apply(ex_fxn_f20002, axis=1)

In [30]:
filtered = filtered[~ex_f20002]

In [31]:
filtered

Unnamed: 0,IID,FID,ignore1,ignore2,ignore3,ignore4,f.31.0.0,f.34.0.0,f.53.0.0,f.53.1.0,...,f.131228.0.0,f.131229.0.0,f.131230.0.0,f.131231.0.0,f.131232.0.0,f.131233.0.0,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
1,1000035,1000035,0,0,1,-9,Male,1944,2007-11-08,,...,,,,,,,,,,
2,1000078,1000078,0,0,2,-9,Female,1955,2007-08-20,2012-11-01,...,,,,,,,,,,
3,1000081,1000081,0,0,1,-9,Male,1942,2009-12-03,,...,,,,,,,,,,
4,1000198,1000198,0,0,2,-9,Female,1967,2009-03-27,,...,,,,,,,,,,
5,1000210,1000210,0,0,1,-9,Male,1941,2008-02-11,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189004,6025295,6025295,0,0,1,-9,Male,1961,2008-01-24,,...,1990-04-05,Primary care only,1991-02-25,Primary care only,,,,,,
189005,6025319,6025319,0,0,2,-9,Female,1953,2010-02-03,,...,,,,,,,,,,
189006,6025346,6025346,0,0,2,-9,Female,1954,2008-08-19,,...,,,,,,,,,,
189007,6025363,6025363,0,0,1,-9,Male,1944,2008-05-17,,...,,,,,,,,,,


### Identify Sex Column

In [33]:
def find_sex(row):
    if row["f.31.0.0"] == "Male":
        return 0
    return 1

sex = filtered[["f.31.0.0"]].apply(find_sex, axis=1)
sex

1         0
2         1
3         0
4         1
5         0
         ..
189004    0
189005    1
189006    1
189007    0
189008    1
Length: 182675, dtype: int64

In [34]:
filtered["sex"] = sex

  filtered["sex"] = sex
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [35]:
filtered

Unnamed: 0,IID,FID,ignore1,ignore2,ignore3,ignore4,f.31.0.0,f.34.0.0,f.53.0.0,f.53.1.0,...,f.131229.0.0,f.131230.0.0,f.131231.0.0,f.131232.0.0,f.131233.0.0,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,sex
1,1000035,1000035,0,0,1,-9,Male,1944,2007-11-08,,...,,,,,,,,,,0
2,1000078,1000078,0,0,2,-9,Female,1955,2007-08-20,2012-11-01,...,,,,,,,,,,1
3,1000081,1000081,0,0,1,-9,Male,1942,2009-12-03,,...,,,,,,,,,,0
4,1000198,1000198,0,0,2,-9,Female,1967,2009-03-27,,...,,,,,,,,,,1
5,1000210,1000210,0,0,1,-9,Male,1941,2008-02-11,,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189004,6025295,6025295,0,0,1,-9,Male,1961,2008-01-24,,...,Primary care only,1991-02-25,Primary care only,,,,,,,0
189005,6025319,6025319,0,0,2,-9,Female,1953,2010-02-03,,...,,,,,,,,,,1
189006,6025346,6025346,0,0,2,-9,Female,1954,2008-08-19,,...,,,,,,,,,,1
189007,6025363,6025363,0,0,1,-9,Male,1944,2008-05-17,,...,,,,,,,,,,0


In [36]:
filtered = filtered.reset_index()

  filtered = filtered.reset_index()


In [37]:
saved_filtered = filtered

In [38]:
filtered = saved_filtered

# f.3393, f.2247, f.2257

## Remove inconsistencies or unclear individuals

Some individuals might be unclear on if they do or do not have hearing difficulties or are inconsistent (found in f.3393, f.2247, and f.2257 ), in which case they cannot be considered either controls or cases and must be removed.

The conditions for being removed are as follows:
* Saying I don't know after saying either yes or no
* Only saying I don't know or prefer not to say
* Being completely deaf

### Prior to filtering for inconsistencies

<b>Hearing difficulty/problems with background noise</b> <br>
f.2257 = {'Yes': 81218, NA : 513774, 'No': 131091, 'Do not know': 4409, 'Prefer not to answer': 208}

<b>Hearing difficult/problems</b><br>
f.2247 = {'No': 151758, : 513806, 'Yes': 55437, 'Do not know': 9489, 'Prefer not to answer': 171, 'I am completely deaf': 39}

<b>Hearing aid user</b><br>
f.3393 = {'No': 145486, : 577795, 'Yes': 7237, 'Prefer not to answer': 182}

In [39]:
phens = ["f.3393", "f.2247", "f.2257"]

hearing_imp_f3393 = [col for col in filtered if "f.3393" in col]
hearing_imp_f2247 = [col for col in filtered if "f.2247" in col]
hearing_imp_f2257 = [col for col in filtered if "f.2257" in col]

icd_10_cols = [col for col in df if "f.41270" in col]
icd_9_cols = [col for col in df if "f.41271" in col]

In [40]:
hearing_ans = {"Do not know":9, "Yes":1, "No":0}
options = set()
# pass one pheno at a time
def find_options(row):
    answer = "".join([str(hearing_ans[i]) for i in row if not pd.isna(i) and i != "I am completely deaf" and i != "Prefer not to answer"])
    options.add(answer)

In [41]:
hearing_imp_qs = filtered[hearing_imp_f3393]
s = hearing_imp_qs.apply(find_options, axis=1)
hearing_imp_qs = filtered[hearing_imp_f2247]
s = hearing_imp_qs.apply(find_options, axis=1)
hearing_imp_qs = filtered[hearing_imp_f2257]
s = hearing_imp_qs.apply(find_options, axis=1)

In [42]:
options

{'',
 '0',
 '00',
 '000',
 '0000',
 '0001',
 '0009',
 '001',
 '0010',
 '0011',
 '0019',
 '009',
 '0090',
 '0091',
 '0099',
 '01',
 '010',
 '0100',
 '0101',
 '011',
 '0110',
 '0111',
 '019',
 '09',
 '090',
 '0900',
 '0901',
 '091',
 '0910',
 '0911',
 '099',
 '0991',
 '1',
 '10',
 '100',
 '1000',
 '1001',
 '101',
 '1011',
 '109',
 '1090',
 '1099',
 '11',
 '110',
 '1100',
 '1101',
 '111',
 '1110',
 '1111',
 '1119',
 '119',
 '1190',
 '1191',
 '19',
 '190',
 '1900',
 '191',
 '199',
 '9',
 '90',
 '900',
 '9000',
 '901',
 '909',
 '91',
 '910',
 '911',
 '9110',
 '9111',
 '919',
 '99',
 '990',
 '991',
 '999',
 '9999'}

In [43]:
just_do_not_know = [i for i in options if '0' not in i and '9' in i and '1' not in i]
do_not_know_no = [i for i in options if '0' in i and '9' in i and '1' not in i]
do_not_know_yes = [i for i in options if '0' not in i and '9' in i and '1' in i]
yes_no = [i for i in options if '0' in i and '9' not in i and '1' in i]
with_all_three = [i for i in options if '0' in i and '9' in i and '1' in i]
might_inconsistent = do_not_know_no + do_not_know_yes + yes_no + with_all_three
exceptions = ["91","911","9111","991","0001","001","0011","01","011","0111", "0091", "091","0911","0991","9001","901","9011", "90", "900", "9000", "990"]
inconsistent = [i for i in might_inconsistent if i not in exceptions]

In [44]:
inconsistent

['09',
 '0099',
 '0900',
 '090',
 '099',
 '009',
 '0009',
 '0090',
 '909',
 '1119',
 '199',
 '191',
 '1191',
 '119',
 '919',
 '19',
 '1011',
 '0100',
 '1110',
 '010',
 '101',
 '1000',
 '0101',
 '10',
 '1001',
 '100',
 '1101',
 '0010',
 '110',
 '1100',
 '0110',
 '109',
 '9110',
 '0019',
 '1099',
 '910',
 '019',
 '190',
 '1190',
 '0910',
 '1090',
 '0901',
 '1900']

In [45]:
def find_empty(row):
    for i in row:
        if not pd.isna(i):
            return False
    return True

In [46]:
def find_dont_know(row):
    temp = []
    for i in row:
        if not pd.isna(i):
            temp.append(i)
        
    if "Do not know" in temp and "Yes" not in temp and "No" not in temp:
        return True
    return False

In [47]:
# will return true if that row should be removed
# pass one pheno at a time
def find_inconsistencies(row):
    for i in row:
        if not pd.isna(i) and i == "I am completely deaf":
            return True
    
    answer = "".join([str(hearing_ans[i]) for i in row if not pd.isna(i) and i != "I am completely deaf" and i != "Prefer not to answer"])
    if answer in inconsistent:
        return True
    return False

# if we have individuals that either don't answer or prefer not to say only then we cancel them out
# return true if all the rows have no definitive answers
def find_all_none(row):
    for i in row:
        if not pd.isna(i) and (i == "Yes" or i == "No"):
            return False
    return True

### Error checking numbers 

In [48]:
hearing_imp_qs = filtered[hearing_imp_f2257]
sum(hearing_imp_qs.apply(find_empty, axis=1))

232

In [49]:
hearing_imp_qs = filtered[hearing_imp_f2257]
sum(hearing_imp_qs.apply(find_dont_know, axis=1))

3187

In [50]:
hearing_imp_qs = filtered[hearing_imp_f2247]
sum(hearing_imp_qs.apply(find_dont_know, axis=1))

6759

In [51]:
hearing_imp_qs = filtered[hearing_imp_f3393]
sum(hearing_imp_qs.apply(find_dont_know, axis=1))

0

In [52]:
hearing_imp_qs = filtered[hearing_imp_f2257]
sum(hearing_imp_qs.apply(find_inconsistencies, axis=1))

2933

In [53]:
hearing_imp_qs = filtered[hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257]
sum(hearing_imp_qs.apply(find_all_none, axis=1))

708

### Filtering out the data

In [54]:
filtered

Unnamed: 0,index,IID,FID,ignore1,ignore2,ignore3,ignore4,f.31.0.0,f.34.0.0,f.53.0.0,...,f.131229.0.0,f.131230.0.0,f.131231.0.0,f.131232.0.0,f.131233.0.0,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,sex
0,1,1000035,1000035,0,0,1,-9,Male,1944,2007-11-08,...,,,,,,,,,,0
1,2,1000078,1000078,0,0,2,-9,Female,1955,2007-08-20,...,,,,,,,,,,1
2,3,1000081,1000081,0,0,1,-9,Male,1942,2009-12-03,...,,,,,,,,,,0
3,4,1000198,1000198,0,0,2,-9,Female,1967,2009-03-27,...,,,,,,,,,,1
4,5,1000210,1000210,0,0,1,-9,Male,1941,2008-02-11,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182670,189004,6025295,6025295,0,0,1,-9,Male,1961,2008-01-24,...,Primary care only,1991-02-25,Primary care only,,,,,,,0
182671,189005,6025319,6025319,0,0,2,-9,Female,1953,2010-02-03,...,,,,,,,,,,1
182672,189006,6025346,6025346,0,0,2,-9,Female,1954,2008-08-19,...,,,,,,,,,,1
182673,189007,6025363,6025363,0,0,1,-9,Male,1944,2008-05-17,...,,,,,,,,,,0


In [55]:
hearing_imp_qs = filtered[hearing_imp_f3393]
exclude = hearing_imp_qs.apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

hearing_imp_qs = filtered[hearing_imp_f2247]
exclude = hearing_imp_qs.apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

hearing_imp_qs = filtered[hearing_imp_f2257]
exclude = hearing_imp_qs.apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

hearing_imp_qs = filtered[hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257]
exclude = hearing_imp_qs.apply(find_all_none, axis=1)
filtered = filtered[~exclude]

In [56]:
filtered

Unnamed: 0,index,IID,FID,ignore1,ignore2,ignore3,ignore4,f.31.0.0,f.34.0.0,f.53.0.0,...,f.131229.0.0,f.131230.0.0,f.131231.0.0,f.131232.0.0,f.131233.0.0,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,sex
0,1,1000035,1000035,0,0,1,-9,Male,1944,2007-11-08,...,,,,,,,,,,0
1,2,1000078,1000078,0,0,2,-9,Female,1955,2007-08-20,...,,,,,,,,,,1
2,3,1000081,1000081,0,0,1,-9,Male,1942,2009-12-03,...,,,,,,,,,,0
3,4,1000198,1000198,0,0,2,-9,Female,1967,2009-03-27,...,,,,,,,,,,1
4,5,1000210,1000210,0,0,1,-9,Male,1941,2008-02-11,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182670,189004,6025295,6025295,0,0,1,-9,Male,1961,2008-01-24,...,Primary care only,1991-02-25,Primary care only,,,,,,,0
182671,189005,6025319,6025319,0,0,2,-9,Female,1953,2010-02-03,...,,,,,,,,,,1
182672,189006,6025346,6025346,0,0,2,-9,Female,1954,2008-08-19,...,,,,,,,,,,1
182673,189007,6025363,6025363,0,0,1,-9,Male,1944,2008-05-17,...,,,,,,,,,,0


In [57]:
saved_2_filtered = filtered

In [58]:
filtered = saved_2_filtered

## Identify Pure Controls

Need to make sure that for f.3393, f.2247, and f.2257 we obtain the individuals that say no.

The conditions for saying no are as follows:
* Never saying yes
* Not being inconsistent
* Saying no at least once (prefer not to say is allowed)

We are also not including individuals to be part of the control group if they have certain codes for ICD9, ICD10, or f.20002 (this is in the case that they say no to all f.3393, f.2247, and f.2257). However these individuals can still be part of the cases

In [59]:
# returns 0 if it's a ctrl or else 1
def find_ctrl(row):
    answer = "".join([str(hearing_ans[i]) for i in row if not pd.isna(i) and i != "Prefer not to answer"])
    if "0" in answer and "1" not in answer:
        return 0
    return 1

# returns 0 if it's a ctrl or else 1
# this is specific for f3393
def find_ctrl_or_NA(row):
    for i in row:
        if not pd.isna(i) and i != "No" and i != "Prefer not to answer": # if we have any answers that are not NA or No only then we don't have a ctrl
            return 1
    return 0


In [60]:
hearing_imp_qs = filtered[hearing_imp_f3393]
f3393_ctrl = hearing_imp_qs.apply(find_ctrl_or_NA, axis=1).to_list()
hearing_imp_qs = filtered[hearing_imp_f2247]
f2247_ctrl = hearing_imp_qs.apply(find_ctrl, axis=1).to_list()
hearing_imp_qs = filtered[hearing_imp_f2257]
f2257_ctrl = hearing_imp_qs.apply(find_ctrl, axis=1).to_list()

In [61]:
pure_ctrl = [0 if i == 0 and f2247_ctrl[en] == 0 and f2257_ctrl[en] == 0 else 1 for en, i in enumerate(f3393_ctrl)]

In [62]:
sum(pure_ctrl)

78593

In [63]:
len(pure_ctrl)

176878

### Remove ICD 10 Codes from CTRL

In [64]:
exclude_ctrl_icd10 = exclusion[(exclusion["UKBB_field_code"] == "f.41270") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd10

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
97,f.41270,H83.3 Noise effects on inner ear,24,N,Y,,,,,,,
98,f.41270,H83.8 Other specified diseases of inner ear,51,N,Y,,,,,,,
99,f.41270,"H83.9 Disease of inner ear, unspecified",33,N,Y,,,,,,,
105,f.41270,"H90.3 Sensorineural hearing loss, bilateral",721,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,Individuals with this code were initially excl...
106,f.41270,"H90.4 Sensorineural hearing loss, unilateral w...",185,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,N,,
107,f.41270,"H90.5 Sensorineural hearing loss, unspecified",880,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,Individuals with this code were initially excl...
108,f.41270,H90.6 Mixed conductive and sensorineural heari...,133,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,Individuals with this code were initially excl...
109,f.41270,H90.7 Mixed conductive and sensorineural heari...,75,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,N,,
110,f.41270,H90.8 Mixed conductive and sensorineural heari...,115,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,Individuals with this code were initially excl...
113,f.41270,H91.1 Presbycusis,408,N,Y,,,,,N,,


In [65]:
ex_critia_ctrl_icd10 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd10["Phenotype"].tolist()]
ex_critia_ctrl_icd10

['H833',
 'H838',
 'H839',
 'H903',
 'H904',
 'H905',
 'H906',
 'H907',
 'H908',
 'H911',
 'H912',
 'H913',
 'H918',
 'H919',
 'H930',
 'H932',
 'H933',
 'H938',
 'H939',
 'Z461',
 'Z974']

In [66]:
icd10 = filtered[icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,...,f.41270.0.203,f.41270.0.204,f.41270.0.205,f.41270.0.206,f.41270.0.207,f.41270.0.208,f.41270.0.209,f.41270.0.210,f.41270.0.211,f.41270.0.212
0,H269,K579,K590,K621,N40,R398,Z466,Z538,,,...,,,,,,,,,,
1,D037,L720,,,,,,,,,...,,,,,,,,,,
2,E780,H251,H269,I10,I210,I219,I251,I252,I258,I259,...,,,,,,,,,,
3,D171,I10,J301,K409,N898,O800,Z370,Z721,Z822,Z861,...,,,,,,,,,,
4,C73,C780,D70,E780,F412,G473,G479,G620,I48,I828,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182670,D171,I845,I846,K318,K429,K529,K602,K610,K921,R073,...,,,,,,,,,,
182671,E669,I10,K801,K802,K85,M179,M233,N921,R102,,...,,,,,,,,,,
182672,O074,,,,,,,,,,...,,,,,,,,,,
182673,,,,,,,,,,,...,,,,,,,,,,


In [67]:
ex_fxn_icd10 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd10)
ex_10 = icd10.apply(ex_fxn_icd10, axis=1)

### Remove ICD 9 Codes from Ctrl

In [68]:
exclude_ctrl_icd9 = exclusion[(exclusion["UKBB_field_code"] == "f.41271") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd9

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
376,f.41271,3880 Degenerative and vascular disorders of ear,0,N,Y,,,,,,,
377,f.41271,3881 Noise effects on inner ear,0,N,Y,,,,,,,
378,f.41271,"3882 Sudden hearing loss, unspecified",0,N,Y,,,,,,,
380,f.41271,3884 Other abnormal auditory perception,0,N,Y,,,,,,,
384,f.41271,3888 Other specified disorders of ear,1,N,Y,,,,,,,
385,f.41271,"3889 Disorders of ear, unspecified",2,N,Y,,,,,,,
388,f.41271,3891 Sensorineural deafness,6,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,
389,f.41271,3892 Mixed conductive and sensorineural deafness,1,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,
390,f.41271,"3897 Deaf mutism, not elsewhere classifiable",1,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,
391,f.41271,3898 Other specified forms of deafness,0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,


In [69]:
ex_critia_ctrl_icd9 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd9["Phenotype"].tolist()]
ex_critia_ctrl_icd9

['3880',
 '3881',
 '3882',
 '3884',
 '3888',
 '3889',
 '3891',
 '3892',
 '3897',
 '3898',
 '3899',
 'V412',
 'V532']

In [70]:
icd9 = filtered[icd9_colnames]
icd9

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,...,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182670,3899,4781,8131,E8860,V540,,,,,,...,,,,,,,,,,
182671,,,,,,,,,,,...,,,,,,,,,,
182672,,,,,,,,,,,...,,,,,,,,,,
182673,,,,,,,,,,,...,,,,,,,,,,


In [71]:
ex_fxn_icd9 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd9)
ex_9 = icd9.apply(ex_fxn_icd9, axis=1)

### Remove f20002 Codes from Ctrl

In [72]:
exclude_ctrl_f20002 = exclusion[(exclusion["UKBB_field_code"] == "f.20002") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_f20002

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations


In [73]:
f20002 = filtered[f20002_colnames]
f20002

Unnamed: 0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,...,f.20002.3.24,f.20002.3.25,f.20002.3.26,f.20002.3.27,f.20002.3.28,f.20002.3.29,f.20002.3.30,f.20002.3.31,f.20002.3.32,f.20002.3.33
0,1396,1473,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,1075,1440,1473,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,1065,1123,1286,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182670,,,,,,,,,,,...,,,,,,,,,,
182671,1065,,,,,,,,,,...,,,,,,,,,,
182672,1452,1265,1387,,,,,,,,...,,,,,,,,,,
182673,,,,,,,,,,,...,,,,,,,,,,


In [74]:
ex_critia_ctrl_f20002 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_f20002["Phenotype"].tolist()]
ex_critia_ctrl_f20002

[]

In [75]:
ex_fxn_f20002 = lambda row: contains_exclusion(row, ex_critia_ctrl_f20002)
ex_f20002 = f20002.apply(ex_fxn_f20002, axis=1)

In [76]:
sum(ex_10), sum(pure_ctrl)

(1659, 78593)

In [77]:
len(pure_ctrl), len(ex_10)

(176878, 176878)

In [78]:
sum(pure_ctrl | ex_10 | ex_9 | ex_f20002)

78796

In [79]:
temp = pure_ctrl | ex_10 | ex_9 | ex_f20002

In [80]:
temp = pure_ctrl | ex_10 | ex_9 | ex_f20002
filtered_ctrl = [1 if i else 0 for i in temp.to_list()]

In [81]:
filtered["hearing_imp_pure_ctrl"] = filtered_ctrl

  filtered["hearing_imp_pure_ctrl"] = filtered_ctrl


In [82]:
filtered

Unnamed: 0,index,IID,FID,ignore1,ignore2,ignore3,ignore4,f.31.0.0,f.34.0.0,f.53.0.0,...,f.131230.0.0,f.131231.0.0,f.131232.0.0,f.131233.0.0,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,sex,hearing_imp_pure_ctrl
0,1,1000035,1000035,0,0,1,-9,Male,1944,2007-11-08,...,,,,,,,,,0,1
1,2,1000078,1000078,0,0,2,-9,Female,1955,2007-08-20,...,,,,,,,,,1,0
2,3,1000081,1000081,0,0,1,-9,Male,1942,2009-12-03,...,,,,,,,,,0,0
3,4,1000198,1000198,0,0,2,-9,Female,1967,2009-03-27,...,,,,,,,,,1,1
4,5,1000210,1000210,0,0,1,-9,Male,1941,2008-02-11,...,,,,,,,,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182670,189004,6025295,6025295,0,0,1,-9,Male,1961,2008-01-24,...,1991-02-25,Primary care only,,,,,,,0,1
182671,189005,6025319,6025319,0,0,2,-9,Female,1953,2010-02-03,...,,,,,,,,,1,0
182672,189006,6025346,6025346,0,0,2,-9,Female,1954,2008-08-19,...,,,,,,,,,1,0
182673,189007,6025363,6025363,0,0,1,-9,Male,1944,2008-05-17,...,,,,,,,,,0,0


In [83]:
filtered = filtered.reset_index()

  filtered = filtered.reset_index()


In [84]:
saved_3_filtered = filtered

In [94]:
filtered = saved_3_filtered

### Identify Age and Phenotype Columns

In [95]:
ages_f21003_col = [col for col in filtered if "f.21003" in col]
ages_f21003_col

['f.21003.0.0', 'f.21003.1.0', 'f.21003.2.0', 'f.21003.3.0']

In [96]:
filtered[ages_f21003_col]

Unnamed: 0,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0
0,63,,,
1,52,57,60,
2,67,,,
3,41,,,
4,66,,,
...,...,...,...,...
176873,46,,,
176874,56,,,
176875,53,,,
176876,64,,,


In [97]:
def get_ctrl_age(row):
    phens = [hearing_imp_f3393, hearing_imp_f2247, hearing_imp_f2257]
    ages = []
    if row["hearing_imp_pure_ctrl"] == 0:
        temp_all_ages = row[ages_f21003_col].to_list()
        temp_all_ages.reverse()
        
        for phen in phens:
            temp = row[phen].to_list()
            temp.reverse()
            for en, i in enumerate(temp):
                if not pd.isna(i) and i == "No":
                    ages.append(temp_all_ages[en])
                    break
        ages.sort()
        return ages[-1]
    return pd.NA

def get_phen_age(row):
    if row[0] == 1:
        temp = row[1:-4].to_list()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "Yes":
                return row[ages_f21003_col][en]
    else:
        return pd.NA
    
def get_min_age(row):
    temp = [i for i in row.to_list() if not pd.isna(i)]
    if len(temp) > 0:
        return min(temp)
    else:
        return pd.NA
    

In [98]:
# return 1 if we have a yes (used to find phenos)
def find_yes(row):
    for i in row:
        if not pd.isna(i) and i == "Yes":
            return 1
    return 0

# return 1 if we have a match for the mendelian traits and have at least one of the hearing phenotypes
def find_medelian_like(row):
    mendelian_icd10 = ["H903", "H905", "H906", "H908", "H913", "H918", "H919"]
    mendelian_icd9 = ["3891", "3892", "3897", "3898", "3899"]
    
    for i in row[icd_10_cols]:
        if not pd.isna(i) and i in mendelian_icd10:
            return 1
    for i in row[icd_9_cols]:
        if not pd.isna(i) and i in mendelian_icd9:
            return 1
    return 0

def find_exclusions(row):
    mendelian_icd10 = ["H903", "H904", "H905", "H906", "H907", "H908"]
    mendelian_icd9 = ["3891", "3892", "3897", "3898", "3899"]
    
    try:
        if 1 == row[0]: # the first column will be one of the phenotypes, "f3393", "f2247", or "f2257"
            for i in row[icd_10_cols]:
                if not pd.isna(i) and i in mendelian_icd10 and int(row[1]) <= 55: # row[1] must be the age of the phenotype
                    return 0
            for i in row[icd_9_cols]:
                if not pd.isna(i) and i in mendelian_icd9 and int(row[1]) <= 55:
                    return 0
        return int(row[0])
    except:
        print(row.name)

# return 1 if we have a match for the other cases of f3393 or originally had f3393
def find_f3393_other_cases(row):
    mendelian_icd10 = ["Z461", "Z974"]
    mendelian_icd9 = ["V412", "V532"]
    if 0 == int(row["f3393"]):
        for i in row[icd_10_cols]:
            if not pd.isna(i) and i in mendelian_icd10:
                return 1
        for i in row[icd_9_cols]:
            if not pd.isna(i) and i in mendelian_icd9:
                return 1
    return int(row["f3393"])

def check_code(row):
    for i in row:
        if not pd.isna(i) and i == "H919":
            return 1
    return 0

In [99]:
hearing_imp_qs = filtered[hearing_imp_f3393]
filtered["f3393"] = hearing_imp_qs.apply(find_yes, axis=1)
filtered["f3393_age"] = filtered[["f3393"] + hearing_imp_f3393 + ages_f21003_col].apply(get_phen_age, axis=1)
filtered["f3393"] = filtered[["f3393", "f3393_age"] + icd_10_cols + icd_9_cols].apply(find_exclusions, axis=1)
filtered["f3393"] = filtered[["f3393"] + icd_10_cols + icd_9_cols].apply(find_f3393_other_cases, axis=1)

hearing_imp_qs = filtered[hearing_imp_f2247]
filtered["f2247"] = hearing_imp_qs.apply(find_yes, axis=1)
filtered["f2247_age"] = filtered[["f2247"] + hearing_imp_f2247 + ages_f21003_col].apply(get_phen_age, axis=1)
filtered["f2247"] = filtered[["f2247", "f2247_age"] + icd_10_cols + icd_9_cols].apply(find_exclusions, axis=1)

hearing_imp_qs = filtered[hearing_imp_f2257]
filtered["f2257"] = hearing_imp_qs.apply(find_yes, axis=1)
filtered["f2257_age"] = filtered[["f2257"] + hearing_imp_f2257 + ages_f21003_col].apply(get_phen_age, axis=1)
filtered["f2257"] = filtered[["f2257", "f2257_age"] + icd_10_cols + icd_9_cols].apply(find_exclusions, axis=1)

  filtered["f2247"] = hearing_imp_qs.apply(find_yes, axis=1)
  filtered["f2247_age"] = filtered[["f2247"] + hearing_imp_f2247 + ages_f21003_col].apply(get_phen_age, axis=1)
  filtered["f2257"] = hearing_imp_qs.apply(find_yes, axis=1)
  filtered["f2257_age"] = filtered[["f2257"] + hearing_imp_f2257 + ages_f21003_col].apply(get_phen_age, axis=1)


In [100]:
filtered["mendelian_age"] = filtered[ages_f21003_col].apply(get_min_age, axis=1)
filtered["mendelian"] = filtered[icd_10_cols + icd_9_cols + ["f3393", "f2247", "f2257"]].apply(find_medelian_like, axis=1)

  filtered["mendelian_age"] = filtered[ages_f21003_col].apply(get_min_age, axis=1)
  filtered["mendelian"] = filtered[icd_10_cols + icd_9_cols + ["f3393", "f2247", "f2257"]].apply(find_medelian_like, axis=1)


In [101]:
filtered["ctrl_age"] = filtered[["hearing_imp_pure_ctrl"] + ages_f21003_col +  hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257].apply(get_ctrl_age, axis=1)

  filtered["ctrl_age"] = filtered[["hearing_imp_pure_ctrl"] + ages_f21003_col +  hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257].apply(get_ctrl_age, axis=1)


In [102]:
filtered["f2247_f2257"] = filtered["f2247"] & filtered["f2257"]
filtered["f2247_f2257_age"] = filtered[["f2247_age", "f2257_age"]].apply(get_min_age, axis=1)

  filtered["f2247_f2257"] = filtered["f2247"] & filtered["f2257"]
  filtered["f2247_f2257_age"] = filtered[["f2247_age", "f2257_age"]].apply(get_min_age, axis=1)


## File Output

In [103]:
filtered

Unnamed: 0,level_0,index,IID,FID,ignore1,ignore2,ignore3,ignore4,f.31.0.0,f.34.0.0,...,f3393_age,f2247,f2247_age,f2257,f2257_age,mendelian_age,mendelian,ctrl_age,f2247_f2257,f2247_f2257_age
0,0,1,1000035,1000035,0,0,1,-9,Male,1944,...,,0,,1,63,63,0,,0,63
1,1,2,1000078,1000078,0,0,2,-9,Female,1955,...,,0,,0,,52,0,60,0,
2,2,3,1000081,1000081,0,0,1,-9,Male,1942,...,,0,,0,,67,0,67,0,
3,3,4,1000198,1000198,0,0,2,-9,Female,1967,...,,1,41,1,41,41,0,,1,41
4,4,5,1000210,1000210,0,0,1,-9,Male,1941,...,,0,,0,,66,0,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176873,182670,189004,6025295,6025295,0,0,1,-9,Male,1961,...,,0,,0,,46,1,,0,
176874,182671,189005,6025319,6025319,0,0,2,-9,Female,1953,...,,0,,0,,56,0,56,0,
176875,182672,189006,6025346,6025346,0,0,2,-9,Female,1954,...,,0,,0,,53,0,53,0,
176876,182673,189007,6025363,6025363,0,0,1,-9,Male,1944,...,,0,,0,,64,0,64,0,


In [104]:
filtered[filtered["hearing_imp_pure_ctrl"] == 0][["FID", "IID", "sex", "hearing_imp_pure_ctrl", "ctrl_age"]].to_csv("pure_ctrl_pheno_file.tsv", sep='\t', index=False)

In [106]:
filtered[filtered["f3393"] == 1][["FID", "IID", "sex", "f3393", "f3393_age"]].to_csv("f3393_pheno_file.tsv", sep='\t', index=False)

In [107]:
filtered[filtered["f2247"] == 1][["FID", "IID", "sex", "f2247", "f2247_age"]].to_csv("f2247_pheno_file.tsv", sep='\t', index=False)

In [108]:
filtered[filtered["f2257"] == 1][["FID", "IID", "sex", "f2257", "f2257_age"]].to_csv("f2257_pheno_file.tsv", sep='\t', index=False)

In [109]:
filtered[filtered["f2247_f2257"] == 1][["FID", "IID", "sex", "f2247_f2257", "f2247_f2257_age"]].to_csv("f2247_f2257_pheno_file.tsv", sep='\t', index=False)

In [112]:
filtered[filtered["mendelian"] == 1][["FID", "IID", "sex", "mendelian", "mendelian_age"]].to_csv("mendelian_pheno_file.tsv", sep='\t', index=False)

# Tinnitus

## Remove inconsistencies or unclear individuals

### Prior to filtering for inconsistencies

<b>Tinnitus</b> <br>
f.4803 = {'No, never': 76141,
 'Yes, but not now, but have in the past': 11400,
 'Yes, now some of the time': 9788,
 'Yes, now a lot of the time': 2973,
 'Yes, now most or all of the time': 7426,
 'Do not know': 1745,
 'Prefer not to answer': 127}

In [113]:
filtered = saved_filtered

In [114]:
tin_cols = [col for col in filtered if "f.4803" in col]

In [115]:
tin_ans = {"Do not know":9, "Yes":1, "No":0, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}
options = set()
# pass one pheno at a time
def find_options(row):
    answer = "".join([str(tin_ans[i]) for i in row if not pd.isna(i) and i != "Prefer not to answer"])
    options.add(answer)

In [116]:
s = filtered[tin_cols].apply(find_options, axis=1)

In [117]:
options

{'',
 '0',
 '00',
 '000',
 '0000',
 '0001',
 '001',
 '0010',
 '0011',
 '009',
 '01',
 '010',
 '0100',
 '011',
 '0111',
 '019',
 '09',
 '090',
 '091',
 '099',
 '1',
 '10',
 '100',
 '1000',
 '101',
 '1010',
 '1011',
 '11',
 '110',
 '1100',
 '1101',
 '111',
 '1110',
 '1111',
 '119',
 '19',
 '190',
 '191',
 '1919',
 '199',
 '9',
 '90',
 '900',
 '901',
 '909',
 '91',
 '911',
 '99',
 '990',
 '991'}

In [118]:
just_do_not_know = [i for i in options if '0' not in i and '9' in i and '1' not in i]
do_not_know_no = [i for i in options if '0' in i and '9' in i and '1' not in i]
do_not_know_yes = [i for i in options if '0' not in i and '9' in i and '1' in i]
yes_no = [i for i in options if '0' in i and '9' not in i and '1' in i]
with_all_three = [i for i in options if '0' in i and '9' in i and '1' in i]
might_inconsistent = do_not_know_no + do_not_know_yes + yes_no + with_all_three
exceptions = ["91","911","9111","991","0001","001","0011","01","011","0111", "0091", "091","0911","0991","9001","901","9011", "90", "900", "9000", "990"]
inconsistent = [i for i in might_inconsistent if i not in exceptions]

In [119]:
inconsistent

['09',
 '090',
 '099',
 '009',
 '909',
 '199',
 '191',
 '1919',
 '119',
 '19',
 '1011',
 '0100',
 '1110',
 '010',
 '1000',
 '101',
 '10',
 '1010',
 '100',
 '1101',
 '0010',
 '110',
 '1100',
 '019',
 '190']

### Filtering out the data

In [120]:
filtered

Unnamed: 0,index,IID,FID,ignore1,ignore2,ignore3,ignore4,f.31.0.0,f.34.0.0,f.53.0.0,...,f.131229.0.0,f.131230.0.0,f.131231.0.0,f.131232.0.0,f.131233.0.0,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,sex
0,1,1000035,1000035,0,0,1,-9,Male,1944,2007-11-08,...,,,,,,,,,,0
1,2,1000078,1000078,0,0,2,-9,Female,1955,2007-08-20,...,,,,,,,,,,1
2,3,1000081,1000081,0,0,1,-9,Male,1942,2009-12-03,...,,,,,,,,,,0
3,4,1000198,1000198,0,0,2,-9,Female,1967,2009-03-27,...,,,,,,,,,,1
4,5,1000210,1000210,0,0,1,-9,Male,1941,2008-02-11,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182670,189004,6025295,6025295,0,0,1,-9,Male,1961,2008-01-24,...,Primary care only,1991-02-25,Primary care only,,,,,,,0
182671,189005,6025319,6025319,0,0,2,-9,Female,1953,2010-02-03,...,,,,,,,,,,1
182672,189006,6025346,6025346,0,0,2,-9,Female,1954,2008-08-19,...,,,,,,,,,,1
182673,189007,6025363,6025363,0,0,1,-9,Male,1944,2008-05-17,...,,,,,,,,,,0


In [121]:
# will return true if that row should be removed
# pass one pheno at a time
def find_inconsistencies(row):
    for i in row:
        if not pd.isna(i) and i == "I am completely deaf":
            return True
    
    answer = "".join([str(tin_ans[i]) for i in row if not pd.isna(i) and i != "I am completely deaf" and i != "Prefer not to answer"])
    if answer in inconsistent:
        return True
    return False

In [122]:
exclude = filtered[tin_cols].apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

In [123]:
filtered

Unnamed: 0,index,IID,FID,ignore1,ignore2,ignore3,ignore4,f.31.0.0,f.34.0.0,f.53.0.0,...,f.131229.0.0,f.131230.0.0,f.131231.0.0,f.131232.0.0,f.131233.0.0,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,sex
0,1,1000035,1000035,0,0,1,-9,Male,1944,2007-11-08,...,,,,,,,,,,0
1,2,1000078,1000078,0,0,2,-9,Female,1955,2007-08-20,...,,,,,,,,,,1
2,3,1000081,1000081,0,0,1,-9,Male,1942,2009-12-03,...,,,,,,,,,,0
3,4,1000198,1000198,0,0,2,-9,Female,1967,2009-03-27,...,,,,,,,,,,1
4,5,1000210,1000210,0,0,1,-9,Male,1941,2008-02-11,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182670,189004,6025295,6025295,0,0,1,-9,Male,1961,2008-01-24,...,Primary care only,1991-02-25,Primary care only,,,,,,,0
182671,189005,6025319,6025319,0,0,2,-9,Female,1953,2010-02-03,...,,,,,,,,,,1
182672,189006,6025346,6025346,0,0,2,-9,Female,1954,2008-08-19,...,,,,,,,,,,1
182673,189007,6025363,6025363,0,0,1,-9,Male,1944,2008-05-17,...,,,,,,,,,,0


## Identify Pure Controls

Need to make sure that for tinnitus we obtain the individuals that say no.

The conditions for saying no are as follows:
* Never saying yes
* Not being inconsistent
* Saying no at least once (prefer not to say is allowed)

We are also not including individuals to be part of the control group if they have certain codes for ICD9, ICD10, or f.20002 (this is in the case that they say no to all f.3393, f.2247, and f.2257). However these individuals can still be part of the cases

In [124]:
# returns 0 if it's a ctrl or else 1
def find_ctrl(row):
    answer = "".join([str(tin_ans[i]) for i in row if not pd.isna(i) and i != "Prefer not to answer"])
    if "0" in answer and "1" not in answer:
        return 0
    return 1

In [125]:
f4803_ctrl = filtered[tin_cols].apply(find_ctrl, axis=1)

In [126]:
sum(f4803_ctrl)

118317

### Remove ICD 10 Codes from CTRL

In [127]:
exclude_ctrl_icd10 = exclusion[(exclusion["UKBB_field_code"] == "f.41270") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd10

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
97,f.41270,H83.3 Noise effects on inner ear,24,N,Y,,,,,,,
98,f.41270,H83.8 Other specified diseases of inner ear,51,N,Y,,,,,,,
99,f.41270,"H83.9 Disease of inner ear, unspecified",33,N,Y,,,,,,,
105,f.41270,"H90.3 Sensorineural hearing loss, bilateral",721,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,Individuals with this code were initially excl...
106,f.41270,"H90.4 Sensorineural hearing loss, unilateral w...",185,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,N,,
107,f.41270,"H90.5 Sensorineural hearing loss, unspecified",880,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,Individuals with this code were initially excl...
108,f.41270,H90.6 Mixed conductive and sensorineural heari...,133,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,Individuals with this code were initially excl...
109,f.41270,H90.7 Mixed conductive and sensorineural heari...,75,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,N,,
110,f.41270,H90.8 Mixed conductive and sensorineural heari...,115,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,Individuals with this code were initially excl...
113,f.41270,H91.1 Presbycusis,408,N,Y,,,,,N,,


In [128]:
ex_critia_ctrl_icd10 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd10["Phenotype"].tolist()]
ex_critia_ctrl_icd10

['H833',
 'H838',
 'H839',
 'H903',
 'H904',
 'H905',
 'H906',
 'H907',
 'H908',
 'H911',
 'H912',
 'H913',
 'H918',
 'H919',
 'H930',
 'H932',
 'H933',
 'H938',
 'H939',
 'Z461',
 'Z974']

In [129]:
icd10 = filtered[icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,...,f.41270.0.203,f.41270.0.204,f.41270.0.205,f.41270.0.206,f.41270.0.207,f.41270.0.208,f.41270.0.209,f.41270.0.210,f.41270.0.211,f.41270.0.212
0,H269,K579,K590,K621,N40,R398,Z466,Z538,,,...,,,,,,,,,,
1,D037,L720,,,,,,,,,...,,,,,,,,,,
2,E780,H251,H269,I10,I210,I219,I251,I252,I258,I259,...,,,,,,,,,,
3,D171,I10,J301,K409,N898,O800,Z370,Z721,Z822,Z861,...,,,,,,,,,,
4,C73,C780,D70,E780,F412,G473,G479,G620,I48,I828,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182670,D171,I845,I846,K318,K429,K529,K602,K610,K921,R073,...,,,,,,,,,,
182671,E669,I10,K801,K802,K85,M179,M233,N921,R102,,...,,,,,,,,,,
182672,O074,,,,,,,,,,...,,,,,,,,,,
182673,,,,,,,,,,,...,,,,,,,,,,


In [130]:
ex_fxn_icd10 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd10)
ex_10 = icd10.apply(ex_fxn_icd10, axis=1)

### Remove ICD 9 Codes from Ctrl

In [131]:
exclude_ctrl_icd9 = exclusion[(exclusion["UKBB_field_code"] == "f.41271") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd9

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
376,f.41271,3880 Degenerative and vascular disorders of ear,0,N,Y,,,,,,,
377,f.41271,3881 Noise effects on inner ear,0,N,Y,,,,,,,
378,f.41271,"3882 Sudden hearing loss, unspecified",0,N,Y,,,,,,,
380,f.41271,3884 Other abnormal auditory perception,0,N,Y,,,,,,,
384,f.41271,3888 Other specified disorders of ear,1,N,Y,,,,,,,
385,f.41271,"3889 Disorders of ear, unspecified",2,N,Y,,,,,,,
388,f.41271,3891 Sensorineural deafness,6,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,
389,f.41271,3892 Mixed conductive and sensorineural deafness,1,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,
390,f.41271,"3897 Deaf mutism, not elsewhere classifiable",1,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,
391,f.41271,3898 Other specified forms of deafness,0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,


In [132]:
ex_critia_ctrl_icd9 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd9["Phenotype"].tolist()]
ex_critia_ctrl_icd9

['3880',
 '3881',
 '3882',
 '3884',
 '3888',
 '3889',
 '3891',
 '3892',
 '3897',
 '3898',
 '3899',
 'V412',
 'V532']

In [133]:
icd9 = filtered[icd9_colnames]
icd9

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,...,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182670,3899,4781,8131,E8860,V540,,,,,,...,,,,,,,,,,
182671,,,,,,,,,,,...,,,,,,,,,,
182672,,,,,,,,,,,...,,,,,,,,,,
182673,,,,,,,,,,,...,,,,,,,,,,


In [134]:
ex_fxn_icd9 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd9)
ex_9 = icd9.apply(ex_fxn_icd9, axis=1)

### Remove f20002 Codes from Ctrl

In [135]:
exclude_ctrl_f20002 = exclusion[(exclusion["UKBB_field_code"] == "f.20002") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_f20002

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations


In [136]:
f20002 = filtered[f20002_colnames]
f20002

Unnamed: 0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,...,f.20002.3.24,f.20002.3.25,f.20002.3.26,f.20002.3.27,f.20002.3.28,f.20002.3.29,f.20002.3.30,f.20002.3.31,f.20002.3.32,f.20002.3.33
0,1396,1473,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,1075,1440,1473,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,1065,1123,1286,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182670,,,,,,,,,,,...,,,,,,,,,,
182671,1065,,,,,,,,,,...,,,,,,,,,,
182672,1452,1265,1387,,,,,,,,...,,,,,,,,,,
182673,,,,,,,,,,,...,,,,,,,,,,


In [137]:
ex_critia_ctrl_f20002 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_f20002["Phenotype"].tolist()]
ex_critia_ctrl_f20002

[]

In [138]:
ex_fxn_f20002 = lambda row: contains_exclusion(row, ex_critia_ctrl_f20002)
ex_f20002 = f20002.apply(ex_fxn_f20002, axis=1)

In [139]:
sum(ex_10), sum(f4803_ctrl)

(1695, 118317)

In [140]:
sum(f4803_ctrl | ex_10 | ex_9 | ex_f20002)

118634

In [141]:
temp = f4803_ctrl | ex_10 | ex_9 | ex_f20002

In [142]:
temp = f4803_ctrl | ex_10 | ex_9 | ex_f20002
filtered_ctrl = [1 if i else 0 for i in temp.to_list()]

In [143]:
filtered["tinnitus_pure_ctrl"] = filtered_ctrl

  filtered["tinnitus_pure_ctrl"] = filtered_ctrl
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [144]:
filtered

Unnamed: 0,index,IID,FID,ignore1,ignore2,ignore3,ignore4,f.31.0.0,f.34.0.0,f.53.0.0,...,f.131230.0.0,f.131231.0.0,f.131232.0.0,f.131233.0.0,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,sex,tinnitus_pure_ctrl
0,1,1000035,1000035,0,0,1,-9,Male,1944,2007-11-08,...,,,,,,,,,0,1
1,2,1000078,1000078,0,0,2,-9,Female,1955,2007-08-20,...,,,,,,,,,1,0
2,3,1000081,1000081,0,0,1,-9,Male,1942,2009-12-03,...,,,,,,,,,0,0
3,4,1000198,1000198,0,0,2,-9,Female,1967,2009-03-27,...,,,,,,,,,1,1
4,5,1000210,1000210,0,0,1,-9,Male,1941,2008-02-11,...,,,,,,,,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182670,189004,6025295,6025295,0,0,1,-9,Male,1961,2008-01-24,...,1991-02-25,Primary care only,,,,,,,0,1
182671,189005,6025319,6025319,0,0,2,-9,Female,1953,2010-02-03,...,,,,,,,,,1,0
182672,189006,6025346,6025346,0,0,2,-9,Female,1954,2008-08-19,...,,,,,,,,,1,1
182673,189007,6025363,6025363,0,0,1,-9,Male,1944,2008-05-17,...,,,,,,,,,0,1


### Identify Age and Phenotype Columns

In [145]:
ages_f21003_col = [col for col in filtered if "f.21003" in col]
ages_f21003_col

['f.21003.0.0', 'f.21003.1.0', 'f.21003.2.0', 'f.21003.3.0']

In [146]:
def get_ctrl_age(row):
    phens = [tin_cols]
    ages = []
    if row["tinnitus_pure_ctrl"] == 0:
        temp_all_ages = row[ages_f21003_col].to_list()
        temp_all_ages.reverse()
        
        for phen in phens:
            temp = row[phen].to_list()
            temp.reverse()
            for en, i in enumerate(temp):
                if not pd.isna(i) and i != "Prefer not to answer" and tin_ans[i] == 0:
                    ages.append(temp_all_ages[en])
                    break
        ages.sort()
        return ages[-1]
    return pd.NA

def get_phen_age(row):
    if row[0] == 1:
        temp = row[1:-4].to_list()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i != "Prefer not to answer" and tin_ans[i] == 1:
                return row[ages_f21003_col][en]
    else:
        return pd.NA

In [147]:
# return 1 if we have a yes (used to find phenos)
def find_yes(row):
    for i in row:
        if not pd.isna(i) and i != "Prefer not to answer" and tin_ans[i] == 1:
            return 1
    return 0

def check_code(row, code):
    for i in row:
        if not pd.isna(i) and i == code:
            return 1
    return 0

In [148]:
tinn_yes = filtered[tin_cols].apply(find_yes, axis=1)

In [149]:
tinn_icd10_check_code = lambda row: check_code(row, "H931")
tinn_icd10 = filtered[icd_10_cols].apply(tinn_icd10_check_code, axis = 1)

In [150]:
tinn_icd9_check_code = lambda row: check_code(row, "3883")
tinn_icd9 = filtered[icd_9_cols].apply(tinn_icd9_check_code, axis = 1)

In [151]:
self_report_cols = [col for col in filtered if "f.20002" in col]
tinn_self_report_check_code = lambda row: check_code(row, "1597")
tinn_self_report = filtered[self_report_cols].apply(tinn_self_report_check_code, axis = 1)

In [152]:
sum(tinn_yes | tinn_icd10 | tinn_icd9 | tinn_self_report)

27053

In [153]:
filtered["tinnitus_age"] = filtered[ages_f21003_col].apply(get_min_age, axis=1)

  filtered["tinnitus_age"] = filtered[ages_f21003_col].apply(get_min_age, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [154]:
filtered["tinnitus"] = tinn_yes | tinn_icd10 | tinn_icd9 | tinn_self_report

  filtered["tinnitus"] = tinn_yes | tinn_icd10 | tinn_icd9 | tinn_self_report


In [155]:
filtered[ages_f21003_col]

Unnamed: 0,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0
0,63,,,
1,52,57,60,
2,67,,,
3,41,,,
4,66,,,
...,...,...,...,...
182670,46,,,
182671,56,,,
182672,53,,,
182673,64,,,


In [156]:
filtered["ctrl_age"] = filtered[["tinnitus_pure_ctrl"] + ages_f21003_col + tin_cols].apply(get_ctrl_age, axis=1)

  filtered["ctrl_age"] = filtered[["tinnitus_pure_ctrl"] + ages_f21003_col + tin_cols].apply(get_ctrl_age, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


## File Output

In [158]:
filtered

Unnamed: 0,index,IID,FID,ignore1,ignore2,ignore3,ignore4,f.31.0.0,f.34.0.0,f.53.0.0,...,f.131233.0.0,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,sex,tinnitus_pure_ctrl,tinnitus_age,tinnitus,ctrl_age
0,1,1000035,1000035,0,0,1,-9,Male,1944,2007-11-08,...,,,,,,0,1,63,0,
1,2,1000078,1000078,0,0,2,-9,Female,1955,2007-08-20,...,,,,,,1,0,52,0,60
2,3,1000081,1000081,0,0,1,-9,Male,1942,2009-12-03,...,,,,,,0,0,67,0,67
3,4,1000198,1000198,0,0,2,-9,Female,1967,2009-03-27,...,,,,,,1,1,41,0,
4,5,1000210,1000210,0,0,1,-9,Male,1941,2008-02-11,...,,,,,,0,1,66,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182670,189004,6025295,6025295,0,0,1,-9,Male,1961,2008-01-24,...,,,,,,0,1,46,0,
182671,189005,6025319,6025319,0,0,2,-9,Female,1953,2010-02-03,...,,,,,,1,0,56,0,56
182672,189006,6025346,6025346,0,0,2,-9,Female,1954,2008-08-19,...,,,,,,1,1,53,0,
182673,189007,6025363,6025363,0,0,1,-9,Male,1944,2008-05-17,...,,,,,,0,1,64,0,


In [159]:
filtered[filtered["tinnitus_pure_ctrl"] == 0][["FID", "IID", "sex", "tinnitus_pure_ctrl", "ctrl_age"]].to_csv("tinnitus_pure_ctrl_pheno_file.tsv", sep='\t', index=False)

In [160]:
filtered[filtered["tinnitus"] == 1][["FID", "IID", "sex", "tinnitus", "tinnitus_age"]].to_csv("tinnitus_pheno_file.tsv", sep='\t', index=False)

# Merge Pheno with Ctrl

## f2247, f2257, f3393, and mendilian

In [164]:
ctrl_file_name = "pure_ctrl_pheno_file.tsv"
f3393_file_name = "f3393_pheno_file.tsv"
f2247_file_name = "f2247_pheno_file.tsv"
f2257_file_name = "f2257_pheno_file.tsv"
f2247_f2257_file_name = "f2247_f2257_pheno_file.tsv"
mendilianlike_file_name = "mendilian-like_pheno_file.tsv"

In [165]:
f3393 = pd.read_csv(f3393_file_name, sep="\t")
f2247 = pd.read_csv(f2247_file_name, sep="\t")
f2257 = pd.read_csv(f2257_file_name, sep="\t")
f2247_f2257 = pd.read_csv(f2247_f2257_file_name, sep="\t")
ctrl = pd.read_csv(ctrl_file_name, sep="\t")
mendlike = pd.read_csv(mendilianlike_file_name, sep="\t")

In [166]:
print("ctrl: ",len(ctrl))
print("f2247: ",len(f2247))
print("f2257: ",len(f2257))
print("f2247_f2257: ",len(f2247_f2257))
print("f3393: ",len(f3393))
print("mendlike: ",len(mendlike))

ctrl:  98082
f2247:  46237
f2257:  66656
f2247_f2257:  39049
f3393:  6305
mendlike:  1520


### f2247

In [167]:
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f2247.columns[3]})
f2247 = f2247.rename(columns={f2247.columns[4]:"age"})

full_pheno = f2247.append(ctrl)
full_pheno.to_csv("080421_UKBB_Hearing_difficulty_f2247_expandedwhite_46237cases_98082ctrl", sep='\t', index=False)

### f2257

In [169]:
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f2257.columns[3]})
f2257 = f2257.rename(columns={f2257.columns[4]:"age"})

full_pheno = f2257.append(ctrl)
full_pheno.to_csv("080421_UKBB_Hearing_noise_f2257_expandedwhite_66656cases_98082ctrl", sep='\t', index=False)

### f2247_f2257

In [170]:
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f2247_f2257.columns[3]})
f2247_f2257 = f2247_f2257.rename(columns={f2247_f2257.columns[4]:"age"})

full_pheno = f2247_f2257.append(ctrl)
full_pheno.to_csv("080421_UKBB_Combined_f2247_f2257_expandedwhite_39049cases_98082ctrl", sep='\t', index=False)

### f3393

In [171]:
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f3393.columns[3]})
f3393 = f3393.rename(columns={f3393.columns[4]:"age"})

full_pheno = f3393.append(ctrl)
full_pheno.to_csv("080421_UKBB_Hearing_aid_f3393_expandedwhite_6305cases_98082ctrl", sep='\t', index=False)

### Mendelian

In [172]:
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:mendlike.columns[3]})
mendlike = mendlike.rename(columns={mendlike.columns[4]:"age"})

full_pheno = mendlike.append(ctrl)
full_pheno.to_csv("080421_UKBB_Mendelian_expandedwhite_1520cases_98082ctrl", sep='\t', index=False)

## Tinnitus

In [173]:
tinnitus_ctrl_file_name = "tinnitus_pure_ctrl_pheno_file.tsv"
tinnitus_file_name = "tinnitus_pheno_file.tsv"

In [174]:
tinnitus_ctrl = pd.read_csv(ctrl_file_name, sep="\t")
tinnitus = pd.read_csv(tinnitus_file_name, sep="\t")

In [175]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[4]:"age", tinnitus_ctrl.columns[3]:tinnitus.columns[3]})
tinnitus = tinnitus.rename(columns={tinnitus.columns[4]:"age"})

In [176]:
print("ctrl: ",len(tinnitus_ctrl))
print("tinnitus: ",len(tinnitus))

ctrl:  98082
tinnitus:  27053


In [178]:
full_pheno = tinnitus.append(tinnitus_ctrl)
full_pheno.to_csv("080421_UKBB_Tinnitus_f4803_expandedwhite_27053cases_62885ctrl", sep='\t', index=False)