# Collect the number of cases with otosclerosis or Meniere's disease in the 500K and 200K individuals

## Important variables in the otosclerosis definition

[Source of report of H80 (otosclerosis)](https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=131251) - f.131251

[Date H80 first reported (otosclerosis)](https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=131250) - f.131250

[Diagnoses - ICD10](https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=41270) - f.41270 code H80

[Diagnoses - ICD9](https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=41271) - f.41271 code 387

[Non-cancer illness code, self-reported](https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=20002) - f.20002 code 1420

## Important variables in the Meniere's definition

[Source of report of H81 (disorders of vestibular function)](https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=131253) - f.131253

[Date H81 first reported (disorders of vestibular function)](https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=131252) - f.131252

## Import libraries

In [136]:
import pandas as pd
import numpy as np
from datetime import datetime

# Read in the data

## Read in database

In [137]:
# Collect the column names needed for this analysis by reading only the header
# row of the phenotype file, so the full table can be loaded with just those columns.

with open("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/092821_UKBB_486416ind_call90.csv") as fp:
    line = fp.readline() # header

# Strip whitespace before the quotes: the last field still carries the line's
# trailing newline, which would otherwise leave a closing quote and "\n" in the
# name and make it unusable as a column name.
header = [col.strip().strip('"') for col in line.split(",")]

# individual identifiers
indiv = ["IID", "FID"]
# f.41270: Diagnoses - ICD10
icd10_colnames = [col for col in header if "f.41270." in col]
# NOTE(review): presumably the date/age columns paired with the ICD10 diagnoses - confirm
icd10_ages = [col for col in header if "f.41280." in col]
# f.41271: Diagnoses - ICD9
icd9_colnames = [col for col in header if "f.41271." in col]
# NOTE(review): presumably the date/age columns paired with the ICD9 diagnoses - confirm
icd9_ages = [col for col in header if "f.41281." in col]
# f.20002: Non-cancer illness code, self-reported
f20002_colnames = [col for col in header if "f.20002." in col]
reported_sex = ["f.31.0.0"]
genetic_sex = ["f.22001.0.0"]
ethnicity = [col for col in header if "f.21000." in col]
# f.131251 / f.131250: source of report and date first reported of H80 (otosclerosis)
source_oto = [col for col in header if "f.131251" in col]
date_oto = [col for col in header if "f.131250" in col]
# f.131253 / f.131252: source of report and date first reported of H81
# (disorders of vestibular function)
source_vestibular = [col for col in header if "f.131253" in col]
date_vestibular = [col for col in header if "f.131252" in col]
year_of_birth = [col for col in header if "f.34." in col]
month_of_birth = [col for col in header if "f.52." in col]

In [138]:
# Flatten the per-field column lists into the single list of columns to load,
# in the same order as before.
combined_cols = [
    *indiv,
    *icd10_colnames,
    *icd10_ages,
    *icd9_colnames,
    *icd9_ages,
    *f20002_colnames,
    *ethnicity,
    *reported_sex,
    *genetic_sex,
    *source_oto,
    *date_oto,
    *source_vestibular,
    *date_vestibular,
    *year_of_birth,
    *month_of_birth,
]

In [139]:
# print the current wall-clock time (timestamp taken just before the database is read in)
print(datetime.now())

2022-06-29 11:37:17.230039


In [140]:
# Load the selected phenotype columns for all individuals we are working with.
# Every value is read as a string.
df = pd.read_csv(
    "/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/092821_UKBB_486416ind_call90.csv",
    usecols=combined_cols,
    dtype="string",
    quotechar='"',
)
df

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
0,1000019,1000019,Female,1960,November,1111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1000022,1000022,Male,1954,August,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1000035,1000035,Male,1944,May,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1000046,1000046,Female,1946,March,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1000054,1000054,Female,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486412,6025409,6025409,Female,1946,November,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1627,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486413,6025411,6025411,Female,1960,November,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486414,6025425,6025425,Female,1963,August,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [141]:
# print the current wall-clock time again (timestamp taken after the database has been read in)
print(datetime.now())

2022-06-29 11:45:21.774247


In [142]:
# keep only the individuals that have a value for f.131251 (source of report of H80, otosclerosis)
filtered = df[df['f.131251.0.0'].notna()]

In [143]:
# display the individuals with a non-missing f.131251.0.0
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
89,1000947,1000947,Female,1941,August,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2006-12-20,Hospital admissions data only,,
510,1005264,1005264,Male,1954,July,1065,1420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1981-07-01,Self-report only,,
548,1005657,1005657,Female,1950,April,1111,1154,1463,1412,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2001-03-08,Primary care only,,
1006,1010464,1010464,Female,1952,February,1265,99999,1385,99999,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2003-01-16,Hospital admissions data only,,
1322,1013752,1013752,Male,1947,May,1517,1196,1265,1436,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1964-01-01,Primary care only,2014-06-06,Primary care only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483585,5996216,5996216,Male,1944,March,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1998-01-08,Hospital admissions data only,,
483665,5997035,5997035,Female,1963,October,1094,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2004-04-22,Hospital admissions data only,,
484179,6002333,6002333,Male,1946,April,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1981-04-01,Primary care only,,
484853,6009310,6009310,Female,1952,November,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2004-01-22,Hospital admissions data only,,


In [144]:
# IIDs of the individuals with a non-missing f.131251.0.0 (used to cross-check the other case definitions)
f131251_id = filtered['IID']

## Where do the reports on otosclerosis come from?

In [145]:
# number of individuals per category of f.131251 (source of report of H80)
print(filtered['f.131251.0.0'].value_counts())

Primary care only                               460
Hospital admissions data only                   279
Self-report only                                161
Primary care and other source(s)                 82
Self-report and other source(s)                  67
Hospital admissions data and other source(s)     15
Name: f.131251.0.0, dtype: Int64


In [146]:
# number of individuals per value of f.131250 (date H80 first reported)
print(filtered['f.131250.0.0'].value_counts())

2007-07-01    9
1988-01-01    9
2002-07-01    8
1998-07-01    8
1993-01-01    8
             ..
1998-12-01    1
1990-12-05    1
2012-09-13    1
2005-11-02    1
2004-01-22    1
Name: f.131250.0.0, Length: 814, dtype: Int64


## Find cases in the ICD10 codes

In [147]:
# names of the ICD10 diagnosis columns (f.41270) present in the database
icd10_colnames = [name for name in filtered.columns if "f.41270" in name]

In [148]:
# restrict the full database to its ICD10 columns (all individuals, ICD10 columns only)
icd10 = df.loc[:, icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,f.41270.0.10,f.41270.0.11,f.41270.0.12,f.41270.0.13,f.41270.0.14,f.41270.0.15,f.41270.0.16,f.41270.0.17,f.41270.0.18,f.41270.0.19,f.41270.0.20,f.41270.0.21,f.41270.0.22,f.41270.0.23,f.41270.0.24,f.41270.0.25,f.41270.0.26,f.41270.0.27,f.41270.0.28,f.41270.0.29,f.41270.0.30,f.41270.0.31,f.41270.0.32,f.41270.0.33,f.41270.0.34,f.41270.0.35,f.41270.0.36,f.41270.0.37,f.41270.0.38,f.41270.0.39,...,f.41270.0.186,f.41270.0.187,f.41270.0.188,f.41270.0.189,f.41270.0.190,f.41270.0.191,f.41270.0.192,f.41270.0.193,f.41270.0.194,f.41270.0.195,f.41270.0.196,f.41270.0.197,f.41270.0.198,f.41270.0.199,f.41270.0.200,f.41270.0.201,f.41270.0.202,f.41270.0.203,f.41270.0.204,f.41270.0.205,f.41270.0.206,f.41270.0.207,f.41270.0.208,f.41270.0.209,f.41270.0.210,f.41270.0.211,f.41270.0.212,f.41270.0.213,f.41270.0.214,f.41270.0.215,f.41270.0.216,f.41270.0.217,f.41270.0.218,f.41270.0.219,f.41270.0.220,f.41270.0.221,f.41270.0.222,f.41270.0.223,f.41270.0.224,f.41270.0.225
0,E041,H738,M750,M754,M758,N898,N920,N946,R104,Z038,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,F101,J342,R619,S8280,W010,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,H269,K579,K590,K621,M5459,N40,R040,R31,R398,Z466,Z538,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,E780,G473,R065,R074,Z824,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,C679,C787,C795,C798,D090,I802,I959,J181,K922,M169,M4782,M5499,M7989,N133,N179,N200,N201,N209,N390,N820,R42,R798,S7200,T831,W010,Y831,Y95,Z089,Z510,Z511,Z513,Z530,Z855,Z871,Z907,Z936,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,A099,D649,E279,E538,E559,I10,I839,K449,K573,K649,M060,M069,M179,M199,M2550,M819,R104,R11,R13,R410,R509,R590,R619,R634,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486412,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486413,O149,O266,O342,O471,O48,O610,O680,Z370,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486414,G551,M501,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Information on these codes can be found here https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=19

```
H80	H80 Otosclerosis	39530	1110	N
H800	H80.0 Otosclerosis involving oval window, nonobliterative	39540	39530	Y
H801	H80.1 Otosclerosis involving oval window, obliterative	39550	39530	Y
H802	H80.2 Cochlear otosclerosis	39560	39530	Y
H808	H80.8 Other otosclerosis	39570	39530	Y
H809	H80.9 Otosclerosis, unspecified	39580	39530	Y
```

In [149]:
# ICD10 codes that make an individual count as an otosclerosis case:
# H80 itself and each of its sub-codes
incl_oto = [
    "H80",
    "H800",
    "H801",
    "H802",
    "H808",
    "H809",
]

In [150]:
def contains_code(row, incl_oto):
    """Return True if any non-missing value in ``row`` is one of the inclusion codes.

    row: iterable of the code values of one individual; missing values are skipped.
    incl_oto: collection of codes that mark the individual for inclusion.
    """
    # the missing-value check comes first so that a missing value is never
    # tested for membership in the inclusion list
    return any(not pd.isna(code) and code in incl_oto for code in row)

In [151]:
# row-wise predicate: does this individual have one of the ICD10 inclusion codes?
def incl_fxn_icd10(row):
    return contains_code(row, incl_oto)

In [152]:
# Flag every individual (row) that has at least one ICD10 otosclerosis code.
# isin() tests all cells at once and missing values never match, so this gives
# the same boolean Series as apply(incl_fxn_icd10, axis=1) without calling a
# Python function once per individual.
incl_10 = icd10.isin(incl_oto).any(axis=1)

In [153]:
# display the per-individual ICD10 inclusion flags
incl_10

0         False
1         False
2         False
3         False
4         False
          ...  
486411    False
486412    False
486413    False
486414    False
486415    False
Length: 486416, dtype: bool

In [154]:
# rows of the full database for the individuals flagged through ICD10
icd10_filtered = df.loc[incl_10]

In [155]:
# display the individuals flagged through ICD10
icd10_filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
89,1000947,1000947,Female,1941,August,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2006-12-20,Hospital admissions data only,,
1006,1010464,1010464,Female,1952,February,1265,99999,1385,99999,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2003-01-16,Hospital admissions data only,,
1811,1018816,1018816,Female,1954,May,1111,1367,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1998-08-17,Hospital admissions data only,,
3399,1035210,1035210,Female,1959,March,1111,1226,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007-02-05,Hospital admissions data only,,
4345,1045073,1045073,Male,1950,March,1065,1419,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2012-09-05,Hospital admissions data only,2012-09-07,Hospital admissions data only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480671,5966131,5966131,Male,1954,April,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1998-08-10,Hospital admissions data only,,
480713,5966577,5966577,Male,1956,October,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007-07-04,Hospital admissions data only,,
483585,5996216,5996216,Male,1944,March,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1998-01-08,Hospital admissions data only,,
483665,5997035,5997035,Female,1963,October,1094,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2004-04-22,Hospital admissions data only,,


In [156]:
# IIDs of the individuals flagged through an ICD10 otosclerosis code
icd10_id = icd10_filtered['IID']

In [157]:
# f.131251 (source of report of H80) categories among the ICD10 cases
print(icd10_filtered['f.131251.0.0'].value_counts())

Hospital admissions data only                   279
Primary care and other source(s)                 74
Self-report and other source(s)                  55
Hospital admissions data and other source(s)     15
Primary care only                                 1
Name: f.131251.0.0, dtype: Int64


In [158]:
# Two ICD10 cases for otosclerosis are NA for f.131251
# (show the ICD10-flagged individuals whose f.131251.0.0 is missing)
icd10_filtered[icd10_filtered['f.131251.0.0'].isna()]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
105402,2089419,2089419,Male,1945,January,1065.0,1473.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
267552,3764605,3764605,Male,1944,December,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Find cases ICD9

In [159]:
# names of the ICD9 diagnosis columns (f.41271) present in the working database
icd9_colnames = [name for name in filtered.columns if "f.41271" in name]

In [160]:
# restrict the working database to its ICD9 columns (all individuals, ICD9 columns only)
icd9 = df.loc[:, icd9_colnames]
icd9

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,f.41271.0.10,f.41271.0.11,f.41271.0.12,f.41271.0.13,f.41271.0.14,f.41271.0.15,f.41271.0.16,f.41271.0.17,f.41271.0.18,f.41271.0.19,f.41271.0.20,f.41271.0.21,f.41271.0.22,f.41271.0.23,f.41271.0.24,f.41271.0.25,f.41271.0.26,f.41271.0.27,f.41271.0.28,f.41271.0.29,f.41271.0.30,f.41271.0.31,f.41271.0.32,f.41271.0.33,f.41271.0.34,f.41271.0.35,f.41271.0.36,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,3000,5198,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486412,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486413,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486414,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Information taken from resource https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=87

```
387	387 Otosclerosis	3318	66	N
3870	3870 Otosclerosis involving oval window, nonobliterative	3319	3318	Y
3871	3871 Otosclerosis involving oval window, obliterative	3320	3318	Y
3872	3872 Cochlear otosclerosis	3321	3318	Y
3878	3878 Other specified otosclerosis	3322	3318	Y
3879	3879 Otosclerosis, unspecified	3323	3318	Y
```

In [161]:
# ICD9 codes that make an individual count as an otosclerosis case:
# 387 itself and each of its sub-codes
incl_oto_9 = [
    "387",
    "3870",
    "3871",
    "3872",
    "3878",
    "3879",
]

In [162]:
def contains_code(row, incl_oto_9):
    """Tell whether the individual in ``row`` should be included.

    row: iterable of the code values of one individual.
    incl_oto_9: collection of inclusion codes.
    Returns True as soon as one non-missing value of ``row`` is found in
    ``incl_oto_9``, otherwise False.
    """
    # drop missing entries lazily, then look for the first inclusion code
    present = (value for value in row if not pd.isna(value))
    return any(value in incl_oto_9 for value in present)

In [163]:
# row-wise predicate: does this individual have one of the ICD9 inclusion codes?
def incl_fxn_icd9(row):
    return contains_code(row, incl_oto_9)

In [164]:
# Flag every individual (row) that has at least one ICD9 otosclerosis code.
# isin() tests all cells at once and missing values never match, so this gives
# the same boolean Series as apply(incl_fxn_icd9, axis=1) without calling a
# Python function once per individual.
incl_9 = icd9.isin(incl_oto_9).any(axis=1)

In [165]:
# rows of the full database for the individuals flagged through ICD9
icd9_filtered = df.loc[incl_9]

In [166]:
# display the individuals flagged through ICD9
icd9_filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
50178,1518964,1518964,Female,1952,May,1065.0,1138.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
66312,1685646,1685646,Male,1938,December,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1992-02-11,Primary care only,1980-02-01,Primary care only
74122,1766142,1766142,Female,1952,January,1077.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
81795,1845280,1845280,Male,1951,April,1111.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
86794,1897067,1897067,Female,1952,March,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
97548,2008079,2008079,Female,1943,November,1074.0,1065.0,1111.0,1474.0,1473.0,1311.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
97808,2010744,2010744,Female,1946,April,1464.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1982-07-02,Primary care only,,
103798,2072693,2072693,Female,1947,December,1135.0,1415.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
128298,2325910,2325910,Female,1948,January,1458.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1983-06-01,Primary care only,,
153650,2587613,2587613,Male,1963,March,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1997-04-17,Hospital admissions data only,,


In [167]:
# IIDs of the individuals flagged through an ICD9 otosclerosis code
icd9_id = icd9_filtered['IID']

In [168]:
# f.131251 (source of report of H80) categories among the ICD9 cases
print(icd9_filtered['f.131251.0.0'].value_counts())

Primary care only                   9
Self-report only                    3
Hospital admissions data only       2
Self-report and other source(s)     2
Primary care and other source(s)    1
Name: f.131251.0.0, dtype: Int64


In [169]:
# 26 ICD9 cases for otosclerosis are NA for f.131251
# (show the ICD9-flagged individuals whose f.131251.0.0 is missing)
icd9_filtered[icd9_filtered['f.131251.0.0'].isna()]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
50178,1518964,1518964,Female,1952,May,1065.0,1138.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
74122,1766142,1766142,Female,1952,January,1077.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
81795,1845280,1845280,Male,1951,April,1111.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
86794,1897067,1897067,Female,1952,March,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
97548,2008079,2008079,Female,1943,November,1074.0,1065.0,1111.0,1474.0,1473.0,1311.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
103798,2072693,2072693,Female,1947,December,1135.0,1415.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
158858,2641222,2641222,Female,1939,May,1065.0,99999.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
179107,2850448,2850448,Male,1941,July,1111.0,1473.0,1440.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
197270,3038081,3038081,Female,1950,July,1074.0,1111.0,1465.0,1466.0,1415.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
201043,3077095,3077095,Male,1944,May,1072.0,1473.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [170]:
# ICD9-flagged individuals whose IID is also among those with a non-missing f.131251.0.0
icd9_filtered[icd9_filtered["IID"].isin(f131251_id)]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
66312,1685646,1685646,Male,1938,December,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1992-02-11,Primary care only,1980-02-01,Primary care only
97808,2010744,2010744,Female,1946,April,1464.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1982-07-02,Primary care only,,
128298,2325910,2325910,Female,1948,January,1458.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1983-06-01,Primary care only,,
153650,2587613,2587613,Male,1963,March,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1997-04-17,Hospital admissions data only,,
205882,3127034,3127034,Female,1943,July,1111.0,1440.0,1440.0,1465.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1989-01-01,Primary care only,,
208829,3157389,3157389,Female,1953,April,1465.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1984-08-06,Primary care only,,
243072,3511583,3511583,Female,1939,January,1111.0,1264.0,99999.0,99999.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1992-08-21,Primary care only,,
243529,3516320,3516320,Male,1942,December,1081.0,1065.0,1396.0,1465.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1986-09-18,Primary care only,,
276671,3858626,3858626,Female,1950,March,1420.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1979-07-01,Self-report only,,
297197,4070664,4070664,Female,1956,March,1420.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1991-09-01,Self-report only,,


## Find cases self-report

In [171]:
# set of the self-report column names (f.20002) present in the working database
f20002_colnames = {name for name in filtered if "f.20002" in name}

In [172]:
# get a dataframe that only contains the self-report columns from the working database.
# f20002_colnames is a set: a set has no defined order and recent pandas versions
# refuse a set as an indexer, so select the matching columns as a list that
# follows the dataframe's own column order.
f20002 = df[[col for col in df.columns if col in f20002_colnames]]
f20002

Unnamed: 0,f.20002.3.9,f.20002.1.31,f.20002.1.30,f.20002.3.7,f.20002.1.6,f.20002.3.27,f.20002.3.10,f.20002.1.4,f.20002.1.17,f.20002.3.24,f.20002.0.28,f.20002.0.3,f.20002.1.5,f.20002.1.0,f.20002.3.2,f.20002.3.31,f.20002.3.30,f.20002.3.6,f.20002.3.15,f.20002.0.6,f.20002.1.23,f.20002.3.8,f.20002.0.27,f.20002.2.24,f.20002.3.20,f.20002.2.31,f.20002.0.32,f.20002.2.21,f.20002.1.13,f.20002.1.15,f.20002.2.10,f.20002.2.3,f.20002.2.33,f.20002.2.32,f.20002.1.26,f.20002.2.15,f.20002.3.26,f.20002.2.30,f.20002.3.1,f.20002.1.33,...,f.20002.1.2,f.20002.3.22,f.20002.2.20,f.20002.3.14,f.20002.1.9,f.20002.2.9,f.20002.1.14,f.20002.3.13,f.20002.0.13,f.20002.0.0,f.20002.1.8,f.20002.1.11,f.20002.2.17,f.20002.2.18,f.20002.2.26,f.20002.3.3,f.20002.0.11,f.20002.2.14,f.20002.0.9,f.20002.1.10,f.20002.3.12,f.20002.3.17,f.20002.0.4,f.20002.1.12,f.20002.0.24,f.20002.1.24,f.20002.2.27,f.20002.3.0,f.20002.0.12,f.20002.2.1,f.20002.0.16,f.20002.0.18,f.20002.0.1,f.20002.0.17,f.20002.2.19,f.20002.2.8,f.20002.2.25,f.20002.3.4,f.20002.3.16,f.20002.2.2
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,1111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,1396,,,,,,,,,,,,,,,,,,,,,,,1473,,,,,,,
3,,,,,,,,,,,,1473,,,,,,,,,,,,,,,,,,,,1571,,,,,,,,,...,,,,,,,,,,1065,,,,,,,,,,,,,1374,,,,,,,1223,,,1294,,,,,,,1473
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486412,,,,,,,,,,,,,,1627,,,,,,,,,,,,,,,,,,1599,,,,,,,,,...,1473,,,,,,,,,1478,,,,,,,,,,,,,,,,,,,,1538,,,1473,,,,,,,1065
486413,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486414,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Information about these codes can be found here https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=6

```
1420	otosclerosis	1474	1469	Y
```

In [173]:
# self-report code (f.20002) that makes an individual count as an otosclerosis case
incl_oto_self = [
    "1420",
]

In [174]:
def contains_code(row, incl_oto_self):
    """Decide whether the individual in ``row`` belongs to the included cases.

    row: iterable of the code values of one individual.
    incl_oto_self: collection of inclusion codes.
    Returns True when at least one non-missing value of ``row`` is in
    ``incl_oto_self``; False otherwise.
    """
    for entry in row:
        # missing values can never be an inclusion code
        if pd.isna(entry):
            continue
        if entry in incl_oto_self:
            return True
    return False

In [175]:
# Row predicate: True when the row carries one of the otosclerosis self-report codes
def incl_fxn_f20002(row):
    return contains_code(row, incl_oto_self)


# One boolean per individual, computed row by row over the self-report columns
incl_f20002 = f20002.apply(incl_fxn_f20002, axis=1)

In [176]:
# Keep the rows of the full database flagged by the self-report filter
f20002_filtered = df.loc[incl_f20002]

In [177]:
# Display the rows of df flagged by the self-report otosclerosis filter
f20002_filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
213,1002216,1002216,Male,1945,June,1223,1420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
510,1005264,1005264,Male,1954,July,1065,1420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1981-07-01,Self-report only,,
2421,1025150,1025150,Female,1942,December,1420,1417,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1988-07-01,Self-report and other source(s),,
8287,1085876,1085876,Female,1953,March,1420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1973-09-01,Self-report only,,
10518,1109006,1109006,Female,1951,July,1277,1420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1987-03-25,Primary care and other source(s),,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472973,5886610,5886610,Male,1947,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1312,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2002-03-01,Primary care and other source(s),,
476631,5924460,5924460,Female,1964,April,1065,1420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1978-10-01,Self-report and other source(s),,
476839,5926575,5926575,Female,1967,March,1420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1999-07-01,Self-report only,,
478217,5940765,5940765,Female,1957,January,1065,1111,1420,1387,1452,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007-07-01,Self-report and other source(s),,


In [178]:
# Count the f.131251 (source of H80 report) categories among the self-reported cases
print(f20002_filtered['f.131251.0.0'].value_counts())

Self-report only                                161
Self-report and other source(s)                  67
Primary care and other source(s)                 13
Hospital admissions data and other source(s)      2
Name: f.131251.0.0, dtype: Int64


In [179]:
# Self-reported otosclerosis cases (code 1420) that have no value in f.131251;
# the output below lists 3 such individuals
f20002_filtered[f20002_filtered['f.131251.0.0'].isna()]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
213,1002216,1002216,Male,1945,June,1223,1420.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
149827,2548256,2548256,Male,1955,December,1420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
223193,3305874,3305874,Female,1961,February,99999,99999.0,1421.0,1265.0,1286.0,1416.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2005-12-09,Primary care and other source(s)


In [180]:
# IIDs of the individuals picked up through self-report
f20002_id = f20002_filtered.loc[:, "IID"]

## Find the unique IIDs across the source-of-report (f.131251), ICD10, ICD9 and self-report case lists

In [181]:
# Stack the four IID series and keep the first occurrence of each IID
df_union = pd.concat(
    [f131251_id, icd10_id, icd9_id, f20002_id]
).drop_duplicates()

In [182]:
# Display the de-duplicated IIDs gathered from the four case lists
df_union

89        1000947
510       1005264
548       1005657
1006      1010464
1322      1013752
           ...   
480772    5967193
483939    5999872
213       1002216
149827    2548256
223193    3305874
Name: IID, Length: 1095, dtype: string

# Find Meniere's disease cases

### Find cases with Meniere in ICD10 codes

```
H81	H81 Disorders of vestibular function	39590	1110	N
H810	H81.0 Meniere's disease	39600	39590	Y
H811	H81.1 Benign paroxysmal vertigo	39610	39590	Y
H812	H81.2 Vestibular neuronitis	39620	39590	Y
H813	H81.3 Other peripheral vertigo	39630	39590	Y
H814	H81.4 Vertigo of central origin	39640	39590	Y
H818	H81.8 Other disorders of vestibular function	39650	39590	Y
H819	H81.9 Disorder of vestibular function, unspecified	39660	39590	Y
```

In [183]:
# Names in `filtered` that belong to the ICD10 diagnosis field (f.41270)
icd10_colnames = [name for name in filtered if "f.41270" in name]

In [184]:
# Restrict the full database to its ICD10 diagnosis columns and display the result
icd10 = df.loc[:, icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,f.41270.0.10,f.41270.0.11,f.41270.0.12,f.41270.0.13,f.41270.0.14,f.41270.0.15,f.41270.0.16,f.41270.0.17,f.41270.0.18,f.41270.0.19,f.41270.0.20,f.41270.0.21,f.41270.0.22,f.41270.0.23,f.41270.0.24,f.41270.0.25,f.41270.0.26,f.41270.0.27,f.41270.0.28,f.41270.0.29,f.41270.0.30,f.41270.0.31,f.41270.0.32,f.41270.0.33,f.41270.0.34,f.41270.0.35,f.41270.0.36,f.41270.0.37,f.41270.0.38,f.41270.0.39,...,f.41270.0.186,f.41270.0.187,f.41270.0.188,f.41270.0.189,f.41270.0.190,f.41270.0.191,f.41270.0.192,f.41270.0.193,f.41270.0.194,f.41270.0.195,f.41270.0.196,f.41270.0.197,f.41270.0.198,f.41270.0.199,f.41270.0.200,f.41270.0.201,f.41270.0.202,f.41270.0.203,f.41270.0.204,f.41270.0.205,f.41270.0.206,f.41270.0.207,f.41270.0.208,f.41270.0.209,f.41270.0.210,f.41270.0.211,f.41270.0.212,f.41270.0.213,f.41270.0.214,f.41270.0.215,f.41270.0.216,f.41270.0.217,f.41270.0.218,f.41270.0.219,f.41270.0.220,f.41270.0.221,f.41270.0.222,f.41270.0.223,f.41270.0.224,f.41270.0.225
0,E041,H738,M750,M754,M758,N898,N920,N946,R104,Z038,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,F101,J342,R619,S8280,W010,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,H269,K579,K590,K621,M5459,N40,R040,R31,R398,Z466,Z538,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,E780,G473,R065,R074,Z824,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,C679,C787,C795,C798,D090,I802,I959,J181,K922,M169,M4782,M5499,M7989,N133,N179,N200,N201,N209,N390,N820,R42,R798,S7200,T831,W010,Y831,Y95,Z089,Z510,Z511,Z513,Z530,Z855,Z871,Z907,Z936,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,A099,D649,E279,E538,E559,I10,I839,K449,K573,K649,M060,M069,M179,M199,M2550,M819,R104,R11,R13,R410,R509,R590,R619,R634,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486412,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486413,O149,O266,O342,O471,O48,O610,O680,Z370,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486414,G551,M501,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [185]:
# ICD10 code that qualifies an individual for the Meniere's disease count
# (H810 = H81.0 Meniere's disease)
incl_meniere = ['H810']

In [186]:
def contains_code(row, incl_meniere):
    """Tell whether ``row`` holds at least one code listed in ``incl_meniere``.

    Missing values (anything ``pd.isna`` reports as missing) are skipped.
    The scan stops at the first matching value and returns True; False is
    returned when the row is exhausted without a match.
    """
    for value in row:
        if pd.isna(value):
            continue
        if value in incl_meniere:
            return True
    return False

In [187]:
# Row predicate: True when the row carries one of the Meniere's ICD10 codes
def incl_fxn_icd10(row):
    return contains_code(row, incl_meniere)

In [188]:
# One boolean per individual: True when any ICD10 cell equals a code in
# incl_meniere. This is the vectorised equivalent of applying incl_fxn_icd10
# to every row; missing cells cannot equal a code string, so they never match.
incl_10_meniere = icd10.isin(incl_meniere).any(axis=1)

In [189]:
# Display the per-row boolean flags produced by the ICD10 Meniere's filter
incl_10_meniere

0         False
1         False
2         False
3         False
4         False
          ...  
486411    False
486412    False
486413    False
486414    False
486415    False
Length: 486416, dtype: bool

In [190]:
# Keep the rows of the full database flagged by the ICD10 Meniere's filter
icd10_meniere = df.loc[incl_10_meniere]

In [191]:
# Display the rows of df whose ICD10 columns contain a Meniere's code
icd10_meniere

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
331,1003440,1003440,Female,1945,February,1309,1294,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2005-04-27,Primary care and other source(s)
452,1004673,1004673,Female,1941,April,1465,1197,1312,1138,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2009-02-02,Primary care and other source(s)
1470,1015307,1015307,Female,1947,September,1421,1080,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1987-07-01,Self-report and other source(s)
1992,1020665,1020665,Male,1951,August,1534,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2597,1026994,1026994,Female,1944,February,1065,1465,1257,1138,1415,1312,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2006-11-01,Primary care and other source(s)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
484424,6004899,6004899,Female,1941,December,1465,1474,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013-01-17,Primary care and other source(s)
485916,6020276,6020276,Female,1952,July,1065,1111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-10-25,Hospital admissions data only
485955,6020690,6020690,Female,1949,March,1065,1162,1421,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2005-07-01,Self-report and other source(s)
486074,6021916,6021916,Female,1944,May,1065,1093,1220,1126,1294,1286,1138,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-02-06,Hospital admissions data only


In [192]:
# IIDs of the individuals picked up through ICD10 codes
icd10_meniere_id = icd10_meniere.loc[:, "IID"]

In [193]:
# Count the f.131253 (source of H81 report) categories among the ICD10 Meniere's cases
print(icd10_meniere['f.131253.0.0'].value_counts())

Hospital admissions data only                   516
Self-report and other source(s)                 423
Primary care and other source(s)                185
Hospital admissions data and other source(s)     29
Self-report only                                  7
Primary care only                                 7
Name: f.131253.0.0, dtype: Int64


In [194]:
# ICD10 Meniere's cases with a missing f.131253 value (8 individuals)
meniere_na = icd10_meniere.loc[icd10_meniere["f.131253.0.0"].isna()]

In [195]:
# Display only the ICD10 diagnosis columns of those individuals
meniere_na.loc[:, icd10_colnames]

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,f.41270.0.10,f.41270.0.11,f.41270.0.12,f.41270.0.13,f.41270.0.14,f.41270.0.15,f.41270.0.16,f.41270.0.17,f.41270.0.18,f.41270.0.19,f.41270.0.20,f.41270.0.21,f.41270.0.22,f.41270.0.23,f.41270.0.24,f.41270.0.25,f.41270.0.26,f.41270.0.27,f.41270.0.28,f.41270.0.29,f.41270.0.30,f.41270.0.31,f.41270.0.32,f.41270.0.33,f.41270.0.34,f.41270.0.35,f.41270.0.36,f.41270.0.37,f.41270.0.38,f.41270.0.39,...,f.41270.0.186,f.41270.0.187,f.41270.0.188,f.41270.0.189,f.41270.0.190,f.41270.0.191,f.41270.0.192,f.41270.0.193,f.41270.0.194,f.41270.0.195,f.41270.0.196,f.41270.0.197,f.41270.0.198,f.41270.0.199,f.41270.0.200,f.41270.0.201,f.41270.0.202,f.41270.0.203,f.41270.0.204,f.41270.0.205,f.41270.0.206,f.41270.0.207,f.41270.0.208,f.41270.0.209,f.41270.0.210,f.41270.0.211,f.41270.0.212,f.41270.0.213,f.41270.0.214,f.41270.0.215,f.41270.0.216,f.41270.0.217,f.41270.0.218,f.41270.0.219,f.41270.0.220,f.41270.0.221,f.41270.0.222,f.41270.0.223,f.41270.0.224,f.41270.0.225
1992,C443,D120,D127,H810,I849,K529,K573,K621,K635,N201,N202,Z860,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
103785,B972,F412,H810,I10,J128,U071,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
123986,B980,D508,D510,D519,E210,E538,E669,E780,E785,E789,E833,E871,E876,F059,F329,H045,H250,H353,H521,H547,H549,H810,I080,I10,I859,J181,K20,K210,K219,K221,K227,K259,K295,K297,K409,K449,K566,K573,K579,K590,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
212529,B370,B962,B968,C61,D122,D123,D124,D125,D509,D519,D649,E039,E559,E668,E669,G473,H810,H919,I10,I493,I500,J181,J22,J449,J459,J90,K219,K259,K295,K298,K317,K573,K621,K635,K660,L033,L890,M069,M139,M159,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
278693,A415,B962,C64,E039,E669,E871,F329,G401,G409,G439,G938,H024,H269,H400,H401,H409,H598,H810,H919,I10,I440,I64,I714,I719,K219,K449,K573,M139,M1396,M171,M179,M199,N179,N183,N189,N288,N390,R074,R410,S0650,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
381712,C64,D410,D649,E780,H810,I10,I251,I259,J459,K590,K769,N131,N179,N281,N40,R104,R590,R934,Z115,Z466,Z921,Z951,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
393063,A099,B349,B379,E119,E669,E780,F329,F419,G473,H810,I10,I208,I209,I251,I259,I447,I509,I518,J010,J069,J22,J329,J348,K219,K449,K590,K629,K803,K808,K900,L982,M069,M139,M1391,M179,M1997,M2323,M2326,M2591,M4807,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
436573,D124,F329,G409,H810,H919,I10,K210,K573,K625,K635,K648,M1394,M1397,M169,M1994,M1997,M201,M2557,M2574,M2577,M480,M511,M653,M674,M679,R11,R42,R51,Z115,Z864,Z922,Z966,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Find cases with Meniere in ICD9 codes

In [196]:
# Names in `filtered` that belong to the ICD9 diagnosis field (f.41271)
icd9_colnames = [name for name in filtered if "f.41271" in name]

In [197]:
# Restrict the working database to its ICD9 diagnosis columns and display the result
icd9 = df.loc[:, icd9_colnames]
icd9

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,f.41271.0.10,f.41271.0.11,f.41271.0.12,f.41271.0.13,f.41271.0.14,f.41271.0.15,f.41271.0.16,f.41271.0.17,f.41271.0.18,f.41271.0.19,f.41271.0.20,f.41271.0.21,f.41271.0.22,f.41271.0.23,f.41271.0.24,f.41271.0.25,f.41271.0.26,f.41271.0.27,f.41271.0.28,f.41271.0.29,f.41271.0.30,f.41271.0.31,f.41271.0.32,f.41271.0.33,f.41271.0.34,f.41271.0.35,f.41271.0.36,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,3000,5198,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486412,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486413,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486414,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


The codes below were taken from the UK Biobank ICD9 coding table:

```
386	386 Vertiginous syndromes and other disorders of vestibular system	3309	66	N
3860	3860 Meniere's disease	3310	3309	Y
3861	3861 Other and unspecified peripheral vertigo	3311	3309	Y
3862	3862 Vertigo of central origin	3312	3309	Y
3863	3863 Labyrinthitis	3313	3309	Y
3864	3864 Labyrinthine fistula	3314	3309	Y
3865	3865 Labyrinthine dysfunction	3315	3309	Y
3868	3868 Other disorders of labyrinth	3316	3309	Y
3869	3869 Unspecified vertiginous syndromes and labyrinthine disorders	3317	3309	Y
```

In [198]:
# ICD9 code that qualifies an individual for the Meniere's disease count
# (3860 = Meniere's disease)
incl_icd9_meniere = ['3860']

In [199]:
def contains_code(row, incl_icd9_meniere):
    """Return True when ``row`` should be included, i.e. holds an inclusion code.

    Values for which ``pd.isna`` is true are ignored; every other value is
    tested for membership in ``incl_icd9_meniere``. Returns False when no
    value in the row matches.
    """
    return any(
        not pd.isna(value) and value in incl_icd9_meniere
        for value in row
    )

In [200]:
# Row predicate: True when the row carries one of the Meniere's ICD9 codes in
# incl_icd9_meniere, i.e. the individual should be included
def incl_fxn_icd9(row):
    return contains_code(row, incl_icd9_meniere)

In [201]:
# One boolean per individual: True when any ICD9 cell equals a code in
# incl_icd9_meniere. This is the vectorised equivalent of applying
# incl_fxn_icd9 to every row; missing cells cannot equal a code string, so
# they never match.
incl_9_meniere = icd9.isin(incl_icd9_meniere).any(axis=1)

In [202]:
# Keep the rows of the full database flagged by the ICD9 Meniere's filter
icd9_meniere = df.loc[incl_9_meniere]

In [203]:
# Display the rows of df whose ICD9 columns contain a Meniere's code
icd9_meniere

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
36276,1375204,1375204,Male,1958,November,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1989-04-30,Primary care only
170466,2761203,2761203,Female,1941,August,1065,1226.0,1421.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1980-07-01,Self-report and other source(s)
181345,2873516,2873516,Female,1948,August,1075,1074.0,1464.0,1421.0,1226.0,1154.0,1474.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1988-02-01,Self-report and other source(s)
181530,2875409,2875409,Male,1944,July,1065,1421.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1989-02-25,Primary care and other source(s)
233611,3413774,3413774,Female,1946,April,1065,1094.0,1226.0,1465.0,1452.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1991-06-01,Primary care only
341860,4532589,4532589,Female,1941,April,1465,1072.0,1278.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1988-01-12,Primary care only
426870,5410383,5410383,Female,1947,October,1482,1287.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,1990-03-29,1990-01-11,1984-12-21,1985-05-17,1989-05-24,1988-06-15,1982-08-16,1984-12-21,1987-04-15,1982-08-16,1987-04-15,1988-06-15,1984-12-21,1987-10-07,,,,,,,,,,,,,,,,,,,,,,,,,,
427301,5414813,5414813,Female,1939,June,1075,1421.0,1154.0,1465.0,1295.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1996-01-01,Self-report and other source(s)
438149,5526977,5526977,Female,1943,October,1225,1459.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1976-04-01,Self-report and other source(s)
450387,5653637,5653637,Female,1948,October,1473,1286.0,1421.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1962-04-01,Self-report and other source(s)


In [204]:
# Count the f.131253 (source of H81 report) categories among the ICD9 Meniere's cases
print(icd9_meniere['f.131253.0.0'].value_counts())

Self-report and other source(s)     7
Primary care only                   3
Primary care and other source(s)    1
Name: f.131253.0.0, dtype: Int64


In [205]:
# ICD9 Meniere's cases that have no value in f.131253
icd9_meniere.loc[icd9_meniere["f.131253.0.0"].isna()]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
426870,5410383,5410383,Female,1947,October,1482,1287,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,1990-03-29,1990-01-11,1984-12-21,1985-05-17,1989-05-24,1988-06-15,1982-08-16,1984-12-21,1987-04-15,1982-08-16,1987-04-15,1988-06-15,1984-12-21,1987-10-07,,,,,,,,,,,,,,,,,,,,,,,,,,


In [206]:
# IIDs of the individuals picked up through ICD9 codes
icd9_meniere_id = icd9_meniere.loc[:, "IID"]

### Find cases with Meniere in self-report

In [207]:
# Names in `filtered` that belong to the self-report field (f.20002).
# Kept as a de-duplicated list in first-seen order rather than a set: pandas
# does not accept a set as a column indexer, and a set would also scramble
# the column order of any frame selected with it.
f20002_colnames = list(dict.fromkeys(col for col in filtered if "f.20002" in col))

In [208]:
# get a dataframe that only contains the self-report columns from the working database
# (list() keeps the selection valid even when f20002_colnames is a set, which
# pandas does not accept as a column indexer)
f20002 = df[list(f20002_colnames)]
f20002

Unnamed: 0,f.20002.3.9,f.20002.1.31,f.20002.1.30,f.20002.3.7,f.20002.1.6,f.20002.3.27,f.20002.3.10,f.20002.1.4,f.20002.1.17,f.20002.3.24,f.20002.0.28,f.20002.0.3,f.20002.1.5,f.20002.1.0,f.20002.3.2,f.20002.3.31,f.20002.3.30,f.20002.3.6,f.20002.3.15,f.20002.0.6,f.20002.1.23,f.20002.3.8,f.20002.0.27,f.20002.2.24,f.20002.3.20,f.20002.2.31,f.20002.0.32,f.20002.2.21,f.20002.1.13,f.20002.1.15,f.20002.2.10,f.20002.2.3,f.20002.2.33,f.20002.2.32,f.20002.1.26,f.20002.2.15,f.20002.3.26,f.20002.2.30,f.20002.3.1,f.20002.1.33,...,f.20002.1.2,f.20002.3.22,f.20002.2.20,f.20002.3.14,f.20002.1.9,f.20002.2.9,f.20002.1.14,f.20002.3.13,f.20002.0.13,f.20002.0.0,f.20002.1.8,f.20002.1.11,f.20002.2.17,f.20002.2.18,f.20002.2.26,f.20002.3.3,f.20002.0.11,f.20002.2.14,f.20002.0.9,f.20002.1.10,f.20002.3.12,f.20002.3.17,f.20002.0.4,f.20002.1.12,f.20002.0.24,f.20002.1.24,f.20002.2.27,f.20002.3.0,f.20002.0.12,f.20002.2.1,f.20002.0.16,f.20002.0.18,f.20002.0.1,f.20002.0.17,f.20002.2.19,f.20002.2.8,f.20002.2.25,f.20002.3.4,f.20002.3.16,f.20002.2.2
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,1111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,1396,,,,,,,,,,,,,,,,,,,,,,,1473,,,,,,,
3,,,,,,,,,,,,1473,,,,,,,,,,,,,,,,,,,,1571,,,,,,,,,...,,,,,,,,,,1065,,,,,,,,,,,,,1374,,,,,,,1223,,,1294,,,,,,,1473
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486412,,,,,,,,,,,,,,1627,,,,,,,,,,,,,,,,,,1599,,,,,,,,,...,1473,,,,,,,,,1478,,,,,,,,,,,,,,,,,,,,1538,,,1473,,,,,,,1065
486413,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486414,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Information about these codes can be found here https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=6

```
1421	meniere's disease
```

In [209]:
# Self-report (f.20002) code that qualifies an individual for the Meniere's
# disease count (1421 = meniere's disease)
incl_meniere_self = ['1421']

In [210]:
# returns if the current individual should be included based on the inclusion list
def contains_code(row, incl_meniere_self):
    """Tell whether any value of `row` is one of the inclusion codes.

    row: iterable of codes for one individual; missing values are skipped.
    incl_meniere_self: collection of codes that qualify the individual.

    Returns True as soon as one non-missing value of `row` is found in
    `incl_meniere_self`, otherwise False.
    """
    # the missing-value check comes first so that NA entries are never
    # compared against the inclusion codes
    return any(
        not pd.isna(code) and code in incl_meniere_self
        for code in row
    )

In [211]:
# collect the individuals that should be included because of self-report
def incl_fxn_f20002(row):
    """Return True when a row of f.20002 codes holds one of the codes in incl_meniere_self."""
    return contains_code(row, incl_meniere_self)


# one boolean per individual (row of f20002), used below as a mask on df
incl_f20002 = f20002.apply(incl_fxn_f20002, axis=1)

In [212]:
f20002_meniere = df[incl_f20002]

In [213]:
f20002_meniere

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
619,1006389,1006389,Male,1940,November,1065,1421,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007-07-01,Self-report only
1470,1015307,1015307,Female,1947,September,1421,1080,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1987-07-01,Self-report and other source(s)
1835,1019052,1019052,Female,1951,September,1111,1265,1261,1226,1421,1457,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1988-07-01,Self-report only
2104,1021814,1021814,Female,1941,January,1082,1072,1473,1465,1421,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2006-07-01,Self-report only
2491,1025878,1025878,Female,1951,September,1421,1374,1138,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2001-07-01,Self-report only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483627,5996643,5996643,Female,1939,August,1065,1465,1458,1286,1421,1278,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1983-07-01,Self-report only
483671,5997090,5997090,Male,1947,February,1421,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1984-08-01,Self-report and other source(s)
485127,6012123,6012123,Female,1958,September,1421,1286,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2005-07-01,Self-report and other source(s)
485955,6020690,6020690,Female,1949,March,1065,1162,1421,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2005-07-01,Self-report and other source(s)


In [214]:
print(f20002_meniere['f.131253.0.0'].value_counts())

Self-report only                                670
Self-report and other source(s)                 592
Primary care and other source(s)                146
Hospital admissions data and other source(s)     17
Primary care only                                 4
Hospital admissions data only                     3
Name: f.131253.0.0, dtype: Int64


In [215]:
f20002_meniere[f20002_meniere['f.131253.0.0'].isna()]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
103797,2072689,2072689,Female,1949,January,99999,99999.0,99999.0,99999.0,1242.0,1421.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
122440,2265470,2265470,Female,1940,March,1286,1421.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
190092,2963984,2963984,Male,1944,January,1065,1473.0,1421.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
268969,3779273,3779273,Male,1949,May,1421,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
429804,5440686,5440686,Male,1949,April,1094,1593.0,1421.0,1474.0,1597.0,1387.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
432615,5469730,5469730,Male,1940,October,1111,1278.0,1421.0,1465.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [216]:
f20002_meniere_id = f20002_meniere['IID']

### Find unique IIDs for cases with Meniere's in ICD10, ICD9 and self-report

In [217]:
df_meniere = pd.concat([icd10_meniere_id, icd9_meniere_id, f20002_meniere_id]).drop_duplicates()

In [218]:
df_meniere

331       1003440
452       1004673
1470      1015307
1992      1020665
2597      1026994
           ...   
483417    5994513
483613    5996503
483627    5996643
485127    6012123
486020    6021359
Name: IID, Length: 2116, dtype: string

In [219]:
# Meniere's case IIDs that also have a value in f.131253.0.0.
# The filter is computed inline because `meniere_filtered` is only defined in a
# later cell, so referencing it here fails when the notebook is run top to bottom.
df_meniere[df_meniere.isin(df.dropna(subset=['f.131253.0.0'])['IID'])]

331       1003440
452       1004673
1470      1015307
2597      1026994
2746      1028530
           ...   
483417    5994513
483613    5996503
483627    5996643
485127    6012123
486020    6021359
Name: IID, Length: 2101, dtype: string

### Find the cases that have information for f.131253 (source of report of H81, disorders of vestibular function)

In [220]:
meniere_filtered = df.dropna(subset=['f.131253.0.0'])

In [221]:
meniere_filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
59,1000642,1000642,Female,1958,November,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2015-06-23,Primary care only
87,1000920,1000920,Female,1951,January,1113,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2008-03-25,Primary care only
93,1000992,1000992,Male,1960,May,1111,1474,1312,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013-10-23,Primary care only
96,1001028,1001028,Female,1942,December,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2010-09-13,Primary care only
160,1001687,1001687,Female,1943,February,1065,1220,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2001-06-18,Primary care only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486074,6021916,6021916,Female,1944,May,1065,1093,1220,1126,1294,1286,1138,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-02-06,Hospital admissions data only
486085,6022039,6022039,Female,1952,November,1074,1065,1506,1373,1286,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013-09-23,Primary care only
486087,6022050,6022050,Female,1956,March,1416,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013-02-13,Primary care only
486189,6023090,6023090,Male,1949,April,1074,1065,1111,1473,1138,1474,99999,1465,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011-06-02,Primary care only


# Apply exclusion criteria, genotype QC and remove PCA outliers for white European

## Read in exclusion criteria for icd10, icd9, and self-report

In [92]:
# csv file that contains information on the exclusion criteria for cases and controls
exclusion = pd.read_csv("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/ICD10_9_selfreport_incl_excl.csv")
exclusion

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
0,f.41270,H60-H62 Diseases of external ear,,,,,,,,,
1,f.41270,H60 Otitis externa,,,,,,,,,
2,f.41270,H60.0 Abscess of external ear,32.0,N,N,,,,,,
3,f.41270,H60.1 Cellulitis of external ear,218.0,N,N,,,,,,
4,f.41270,H60.2 Malignant otitis externa,49.0,N,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
566,f.20002,1491 brain haemorrhage,218.0,Y,,,,,,,
567,f.20002,1583 ischaemic stroke,44.0,N,N,,,,,,
568,f.20002,1082 transient ischaemic attack (tia),2243.0,N,N,,,,,,
569,f.20002,1083 subdural haemorrhage/haematoma,212.0,Y,,,,,,,


## Read in individuals in genotype array QC

In [93]:
qc_individuals = pd.read_csv("/mnt/vast/hpc/csg/UKBiobank/results/092821_PCA_results_500K/white_europeans/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.white_europeans.filtered.fam", sep="\t", header=None)
qc_individuals

Unnamed: 0,0,1,2,3,4,5
0,1000019,1000019,0,0,2,-9
1,1000022,1000022,0,0,1,-9
2,1000035,1000035,0,0,1,-9
3,1000046,1000046,0,0,2,-9
4,1000054,1000054,0,0,2,-9
...,...,...,...,...,...,...
460644,6025390,6025390,0,0,2,-9
460645,6025409,6025409,0,0,2,-9
460646,6025411,6025411,0,0,2,-9
460647,6025425,6025425,0,0,2,-9


## Read in PCA outlier file

In [94]:
# outlier individuals that will need to be removed
outlier = pd.read_csv("/mnt/vast/hpc/csg/UKBiobank/results/092821_PCA_results_500K/092821_PCA_related_pval0.005/ukb47922_white_460649ind.092821_PCA_related_pval0.005.pca.projected.outliers", sep="\t", header=None)
outlier

Unnamed: 0,0,1
0,1003423,1003423
1,1008606,1008606
2,1009852,1009852
3,1010412,1010412
4,1010678,1010678
...,...,...
1377,5801962,5801962
1378,5807807,5807807
1379,5809112,5809112
1380,5833189,5833189


## Define individuals with exome data

In [95]:
individs_200k = pd.read_csv("/mnt/vast/hpc/csg/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c1.merged.filtered.fam", header=None, sep=" ")

## Remove individuals that do not match for reported and genetic sex

In [96]:
reported_sex = ["f.31.0.0"]
genetic_sex = ["f.22001.0.0"]

In [97]:
# returns True when the reported and genetic sex cannot be confirmed to match
def inconsistent_sexes(row):
    """Flag an individual whose reported and genetic sex are not consistent.

    row: mapping/Series holding the columns named by reported_sex[0] and
        genetic_sex[0].

    Returns True when either value is missing or when the two values differ,
    False when both are present and equal.
    """
    reported = row[reported_sex[0]]
    genetic = row[genetic_sex[0]]
    # Comparing a missing value with != yields pd.NA rather than a boolean,
    # so missing values are handled explicitly before the comparison.
    if pd.isna(reported) or pd.isna(genetic):
        return True
    return reported != genetic

In [98]:
# exclusion based on inconsistent sex
ex_sex = df[reported_sex + genetic_sex].apply(inconsistent_sexes, axis=1)

In [99]:
df_filtered = df[~ex_sex]

In [100]:
print(sum(ex_sex), "of 500k individuals removed because of inconsistency with the genetic and reported sex variables")

0 of 500k individuals removed because of inconsistency with the genetic and reported sex variables


In [101]:
df_filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0
0,1000019,1000019,Female,1960,November,1111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1000022,1000022,Male,1954,August,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1000035,1000035,Male,1944,May,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1000046,1000046,Female,1946,March,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1000054,1000054,Female,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486412,6025409,6025409,Female,1946,November,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1627,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486413,6025411,6025411,Female,1960,November,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
486414,6025425,6025425,Female,1963,August,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Remove non-white individuals

In [102]:
# set of answers for the ethnicity question, pooled over every column listed in
# `ethnicity` (not only the first three)
set().union(*(df_filtered[col].to_list() for col in ethnicity))

{<NA>,
 'African',
 'Any other Asian background',
 'Any other Black background',
 'Any other mixed background',
 'Any other white background',
 'Asian or Asian British',
 'Bangladeshi',
 'Black or Black British',
 'British',
 'Caribbean',
 'Chinese',
 'Do not know',
 'Indian',
 'Irish',
 'Mixed',
 'Other ethnic group',
 'Pakistani',
 'Prefer not to answer',
 'White',
 'White and Asian',
 'White and Black African',
 'White and Black Caribbean'}

In [103]:
# these should align with all possible options for ethnicity answers except for <NA>, Do not know, and Prefer not to answer
white = ['British', 'Irish', 'White','Any other white background']
african = ['Caribbean','White and Black Caribbean', 'African', 'White and Black African', 'Black or Black British', 'Any other Black background' ]
asian = ['Indian', 'Pakistani', 'White and Asian', 'Any other Asian background', 'Bangladeshi', 'Asian or Asian British']
mixed = ['Mixed', 'Any other mixed background']
chinese = ['Chinese']
other = ['Other ethnic group']

# figure out the ancestry of each individual
def ancestry(row):
    """Collapse the ethnicity answers of one individual into a single label.

    row: mapping/Series indexed by the column names in `ethnicity`.

    Missing values, "Prefer not to answer" and "Do not know" are ignored.
    Returns:
      - "Unknown" when no usable answer is left;
      - the answer itself (spaces replaced by underscores) when every usable
        answer is the same entry of `white`;
      - "Inconsistent_white", "African", "Asian", "Mixed", "Chinese" or
        "Other" when all usable answers fall in that one group;
      - "Inconsistent" when the answers span several groups.
    """
    answers = [
        value
        for value in row[ethnicity]
        if not pd.isna(value)
        and value not in ("Prefer not to answer", "Do not know")
    ]
    if not answers:
        return "Unknown"

    first = answers[0]
    # a single distinct white answer keeps its own name as the label
    if len(set(answers)) == 1 and first in white:
        return first.replace(" ", "_")

    # groups are tested in this order; the first one containing every answer wins
    groups = (
        (white, "Inconsistent_white"),
        (african, "African"),
        (asian, "Asian"),
        (mixed, "Mixed"),
        (chinese, "Chinese"),
        (other, "Other"),
    )
    for members, label in groups:
        if all(answer in members for answer in answers):
            return label
    return "Inconsistent"

Add an `ethnicity` column that combines each individual's ethnicity answers from the database into a single label

In [104]:
# df_filtered is a boolean-mask selection of df, so the column is added with
# .assign (which returns a new dataframe) instead of being written into that
# selection in place, which raises SettingWithCopyWarning.
df_filtered = df_filtered.assign(ethnicity=df_filtered[ethnicity].apply(ancestry, axis=1))



In [107]:
def find_non_white(row):
    """Return True when the row's `ethnicity` label is not one that is kept.

    Kept labels are the entries of `white` plus "Unknown",
    "Inconsistent_white" and "Any_other_white_background"; every other label
    marks the individual for removal.
    """
    label = row["ethnicity"]
    kept = label in white or label in (
        "Unknown",
        "Inconsistent_white",
        "Any_other_white_background",
    )
    return not kept

In [108]:
ex_non_white = df_filtered[["ethnicity"]].apply(find_non_white, axis=1)

In [109]:
df_filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,ethnicity
0,1000019,1000019,Female,1960,November,1111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
1,1000022,1000022,Male,1954,August,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
2,1000035,1000035,Male,1944,May,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
3,1000046,1000046,Female,1946,March,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
4,1000054,1000054,Female,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486412,6025409,6025409,Female,1946,November,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1627,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486413,6025411,6025411,Female,1960,November,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486414,6025425,6025425,Female,1963,August,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British


In [110]:
df_filtered = df_filtered[~ex_non_white]

In [111]:
print(sum(ex_non_white), "of 500k individuals removed for being non-white")

25767 of 500k individuals removed for being non-white


In [112]:
df_filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,ethnicity
0,1000019,1000019,Female,1960,November,1111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
1,1000022,1000022,Male,1954,August,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
2,1000035,1000035,Male,1944,May,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
3,1000046,1000046,Female,1946,March,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
4,1000054,1000054,Female,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486412,6025409,6025409,Female,1946,November,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1627,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486413,6025411,6025411,Female,1960,November,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486414,6025425,6025425,Female,1963,August,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British


##  Only keep individuals that passed genotype array QC

In [113]:
# ids from column 0 of the genotype QC .fam file, stored as strings
qc_list = {str(fid) for fid in qc_individuals[0].to_list()}


def matches_qc_individuals(row):
    """Return True when the row's FID is one of the ids in qc_list."""
    fid = row["FID"]
    return fid in qc_list

In [114]:
df_filtered = df_filtered[df_filtered[["FID"]].apply(matches_qc_individuals, axis=1)]

## Remove PCA outliers from the full database

In [115]:
# since the IID from the dataframe is in string the outlier ids have to be made into string as well
out_ids = [str(x) for x in outlier[0].to_list()]
# set copy of out_ids: membership is tested once per dataframe row, so a set
# lookup replaces a scan of the whole list for every row
_out_id_set = set(out_ids)


def find_outliers(row):
    """Return True when the row's IID is listed among the PCA outlier ids."""
    return row["IID"] in _out_id_set

In [116]:
# Boolean mask, indexed like df_filtered, that is True where the row's IID is
# one of the PCA outlier IDs in out_ids. Series.isin checks every IID in one
# vectorized pass instead of calling a Python function once per row via
# DataFrame.apply(axis=1).
ex_pca_outliers = df_filtered["IID"].isin(out_ids)

In [117]:
# Drop the rows flagged in ex_pca_outliers, keeping every other row.
df_filtered = df_filtered.loc[~ex_pca_outliers]

In [118]:
# Count the True entries of the outlier mask and report how many rows were removed.
n_pca_outliers = sum(ex_pca_outliers)
print(n_pca_outliers, "of 500k individuals removed for being pca outliers")

1382 of 500k individuals removed for being pca outliers


In [119]:
# Display df_filtered as it stands after the genotype-QC filter and the PCA-outlier removal
df_filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,ethnicity
0,1000019,1000019,Female,1960,November,1111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
1,1000022,1000022,Male,1954,August,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
2,1000035,1000035,Male,1944,May,1396,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
3,1000046,1000046,Female,1946,March,1065,1294,1476,1473,1374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
4,1000054,1000054,Female,1942,January,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486412,6025409,6025409,Female,1946,November,1478,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1627,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486413,6025411,6025411,Female,1960,November,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
486414,6025425,6025425,Female,1963,August,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British


# Find number of Otosclerosis cases before applying exclusion criteria

In [128]:
# Otosclerosis cases in the 500K sample of white Europeans after sample QC but before applying any exclusion criteria.
# Selects the rows of df_filtered whose IID is in df_union; the number of rows displayed is the case count.
df_filtered[df_filtered['IID'].isin(df_union)]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,ethnicity
89,1000947,1000947,Female,1941,August,1065,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2006-12-20,Hospital admissions data only,,,British
213,1002216,1002216,Male,1945,June,1223,1420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,British
510,1005264,1005264,Male,1954,July,1065,1420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1981-07-01,Self-report only,,,British
548,1005657,1005657,Female,1950,April,1111,1154,1463,1412,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2001-03-08,Primary care only,,,British
1006,1010464,1010464,Female,1952,February,1265,99999,1385,99999,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2003-01-16,Hospital admissions data only,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483585,5996216,5996216,Male,1944,March,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1998-01-08,Hospital admissions data only,,,British
483939,5999872,5999872,Female,1951,September,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Irish
484179,6002333,6002333,Male,1946,April,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1981-04-01,Primary care only,,,British
484853,6009310,6009310,Female,1952,November,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2004-01-22,Hospital admissions data only,,,British


In [124]:
# Read the .fam file of the 200K exome data (no header row, space-separated).
fam_200k = pd.read_csv("~/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c1.merged.filtered.fam", header=None, sep=" ")
# Keep the FIDs of df_filtered that also occur in the first .fam column; the .fam
# IDs are converted to strings before the comparison with the dataframe's FID values.
individs_200k = df_filtered.loc[
    df_filtered["FID"].isin({str(fam_id) for fam_id in fam_200k[0].to_list()}),
    "FID",
].tolist()

In [126]:
# Number of individuals from the 200K exome .fam file that are also present in df_filtered
len(individs_200k)

187903

In [131]:
# Restrict the filtered dataframe to the individuals collected in individs_200k.
# NOTE(review): individs_200k was built from FID values but is matched against IID
# here; this selects the same rows only if FID == IID for every individual - confirm.
subset_200k = df_filtered.loc[df_filtered['IID'].isin(individs_200k)]

In [133]:
# Otosclerosis cases in the 200K sample before applying any exclusion criteria.
# Selects the rows of subset_200k whose IID is in df_union; the number of rows displayed is the case count.
subset_200k[subset_200k['IID'].isin(df_union)]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,ethnicity
510,1005264,1005264,Male,1954,July,1065,1420,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1981-07-01,Self-report only,,,British
1006,1010464,1010464,Female,1952,February,1265,99999,1385,99999,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2003-01-16,Hospital admissions data only,,,British
1322,1013752,1013752,Male,1947,May,1517,1196,1265,1436,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1964-01-01,Primary care only,2014-06-06,Primary care only,British
1495,1015569,1015569,Female,1958,July,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1989-01-01,Primary care only,,,British
2826,1029344,1029344,Female,1950,February,1111,1225,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1976-01-01,Primary care only,,,Irish
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478217,5940765,5940765,Female,1957,January,1065,1111,1420,1387,1452,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007-07-01,Self-report and other source(s),,,British
480671,5966131,5966131,Male,1954,April,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1998-08-10,Hospital admissions data only,,,British
482794,5988084,5988084,Male,1961,December,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1995-04-01,Primary care only,,,British
483541,5995762,5995762,Female,1959,October,1065,1154,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2010-07-02,Primary care only,,,British


# Find number of Meniere's cases before applying exclusion criteria

In [134]:
# Meniere's cases in the 500K sample of white Europeans after sample QC but before applying any exclusion criteria.
# Selects the rows of df_filtered whose IID is in df_meniere; the number of rows displayed is the case count.
df_filtered[df_filtered['IID'].isin(df_meniere)]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,ethnicity
331,1003440,1003440,Female,1945,February,1309,1294,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2005-04-27,Primary care and other source(s),British
452,1004673,1004673,Female,1941,April,1465,1197,1312,1138,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2009-02-02,Primary care and other source(s),British
619,1006389,1006389,Male,1940,November,1065,1421,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007-07-01,Self-report only,British
1470,1015307,1015307,Female,1947,September,1421,1080,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1987-07-01,Self-report and other source(s),British
1835,1019052,1019052,Female,1951,September,1111,1265,1261,1226,1421,1457,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1988-07-01,Self-report only,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485916,6020276,6020276,Female,1952,July,1065,1111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-10-25,Hospital admissions data only,British
485955,6020690,6020690,Female,1949,March,1065,1162,1421,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2005-07-01,Self-report and other source(s),British
486020,6021359,6021359,Female,1942,July,1065,1464,1473,1421,1154,1351,99999,1226,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1994-07-01,Self-report only,British
486074,6021916,6021916,Female,1944,May,1065,1093,1220,1126,1294,1286,1138,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-02-06,Hospital admissions data only,British


In [135]:
# Meniere's cases in the 200K sample of white Europeans after sample QC but before applying any exclusion criteria.
# Selects the rows of subset_200k whose IID is in df_meniere; the number of rows displayed is the case count.
subset_200k[subset_200k['IID'].isin(df_meniere)]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,f.20002.0.19,f.20002.0.20,f.20002.0.21,f.20002.0.22,f.20002.0.23,f.20002.0.24,f.20002.0.25,f.20002.0.26,f.20002.0.27,f.20002.0.28,f.20002.0.29,f.20002.0.30,f.20002.0.31,f.20002.0.32,f.20002.0.33,f.20002.1.0,...,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131250.0.0,f.131251.0.0,f.131252.0.0,f.131253.0.0,ethnicity
452,1004673,1004673,Female,1941,April,1465,1197,1312,1138,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2009-02-02,Primary care and other source(s),British
2491,1025878,1025878,Female,1951,September,1421,1374,1138,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2001-07-01,Self-report only,British
2638,1027444,1027444,Female,1939,October,1065,1421,99999,1473,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1999-04-01,Self-report only,British
3272,1033917,1033917,Male,1940,December,1065,1094,1093,1113,1111,1465,1265,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2010-04-09,Hospital admissions data only,British
4442,1046062,1046062,Female,1948,January,1293,1201,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-01-10,Hospital admissions data only,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481124,5970836,5970836,Female,1940,July,1111,1277,1465,1421,1309,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1992-11-23,Primary care and other source(s),Irish
481294,5972629,5972629,Female,1945,May,1464,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013-12-17,Hospital admissions data only,British
483466,5995002,5995002,Male,1966,April,1286,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007-07-01,Self-report and other source(s),British
483627,5996643,5996643,Female,1939,August,1065,1465,1458,1286,1421,1278,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1983-07-01,Self-report only,British


# Next step: decide on exclusion criteria for these samples to be able to run association analyses