# Generate the phenotypes for the hearing impairment traits from the UKBB

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

# 1. Read in the data

## 1.1. Read in database

In [2]:
# Collect the column names of the database that are needed for the analysis.

def _columns_for_field(column_names, field):
    """Return the names in `column_names` that belong to the field `field`.

    A name belongs to the field when it equals `field` or starts with `field`
    followed by a dot (for example "f.34" selects "f.34.0.0"). A plain
    substring test is not used because it would also select any other field
    whose identifier merely begins with the same characters (a name such as
    "f.3404.0.0" contains "f.34").
    """
    prefix = field + "."
    return [name for name in column_names if name == field or name.startswith(prefix)]


with open("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/082321_UKBB_exomes.csv") as fp:
    line = fp.readline() # header
    # Drop the line terminator before splitting, then remove the surrounding
    # quotes, so the last column is cleaned the same way as all the others.
    header = [col.strip().strip('"') for col in line.rstrip("\r\n").split(",")]

indiv = ["IID", "FID"]
icd10_colnames = _columns_for_field(header, "f.41270")
icd9_colnames = _columns_for_field(header, "f.41271")
f20002_colnames = _columns_for_field(header, "f.20002")
reported_sex = ["f.31.0.0"]
genetic_sex = ["f.22001.0.0"]
ethnicity = _columns_for_field(header, "f.21000")
hearing_imp_f3393 = _columns_for_field(header, "f.3393")
hearing_imp_f2247 = _columns_for_field(header, "f.2247")
hearing_imp_f2257 = _columns_for_field(header, "f.2257")
tin_cols = _columns_for_field(header, "f.4803")
ages_f21003_col = _columns_for_field(header, "f.21003")
ages_f131258_col = _columns_for_field(header, "f.131258")
year_of_birth = _columns_for_field(header, "f.34")
month_of_birth = _columns_for_field(header, "f.52")

In [4]:
# flatten every group of column names, in order, into the one list used to select columns
combined_cols = [
    *indiv,
    *icd10_colnames,
    *icd9_colnames,
    *f20002_colnames,
    *ethnicity,
    *reported_sex,
    *genetic_sex,
    *hearing_imp_f3393,
    *hearing_imp_f2247,
    *hearing_imp_f2257,
    *tin_cols,
    *ages_f21003_col,
    *ages_f131258_col,
    *year_of_birth,
    *month_of_birth,
]

In [5]:
print(datetime.now())

2021-09-01 04:36:43.881921


In [6]:
# Load the selected phenotype columns for all individuals we are working with.
# Every column is read with the "string" dtype.
df = pd.read_csv(
    "/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/082321_UKBB_exomes.csv",
    usecols=combined_cols,
    dtype="string",
    quotechar='"',
)
df

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,
1,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,
2,1000078,1000078,Female,1955,June,No,No,No,,No,...,,,,,,,,,,
3,1000081,1000081,Male,1942,February,No,,,,No,...,,,,,,,,,,
4,1000198,1000198,Female,1967,July,Yes,,,,Yes,...,,,,,,,,,,1969-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,6025295,6025295,Male,1961,April,No,,,,No,...,,,,,,,,,,
200615,6025319,6025319,Female,1953,March,No,,,,No,...,,,,,,,,,,
200616,6025346,6025346,Female,1954,October,No,,,,No,...,,,,,,,,,,
200617,6025363,6025363,Male,1944,April,No,,,,No,...,,,,,,,,,,


In [7]:
print(datetime.now())

2021-09-01 04:41:47.811381


## 1.2. Read in exclusion criteria for icd10, icd9, and self-report

In [8]:
# csv file that contains information on the exclusion criteria for cases and controls
exclusion = pd.read_csv("~/ICD10_9_selfreport_incl_excl.csv")
exclusion

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
0,f.41270,H60-H62 Diseases of external ear,,,,,,,,,,
1,f.41270,H60 Otitis externa,,,,,,,,,,
2,f.41270,H60.0 Abscess of external ear,32,N,N,,,,,,,
3,f.41270,H60.1 Cellulitis of external ear,218,N,N,,,,,,,
4,f.41270,H60.2 Malignant otitis externa,49,N,N,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
571,f.20002,1491 brain haemorrhage,218,Y,,,,,,,,
572,f.20002,1583 ischaemic stroke,44,N,N,,,,,,,
573,f.20002,1082 transient ischaemic attack (tia),2243,N,N,,,,,,,
574,f.20002,1083 subdural haemorrhage/haematoma,212,Y,,,,,,,,


## 1.3. Read in PCA outlier file

In [9]:
# outlier individuals that will need to be removed
outlier = pd.read_csv("~/030821_ukb42495_exomed_white_189010ind.pheno.white_expanded_07_09_21_genoarray_projected.pca.projected.outliers", sep="\t", header=None)
outlier

Unnamed: 0,0,1
0,1008606,1008606
1,1010412,1010412
2,1045757,1045757
3,1057699,1057699
4,1069457,1069457
...,...,...
563,4773865,4773865
564,5109700,5109700
565,5637210,5637210
566,5748329,5748329


In [10]:
df1 = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/hearing_impairment/010421_UKBB_Hearing_background_noise_f2257_175531ind_exomes", sep=" ")

In [11]:
df2 = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/hearing_impairment/080421_UKBB_Hearing_noise_f2257_expandedwhite_66656cases_98082ctrl", sep="\t")

In [12]:
# IIDs with hearing_noise_cat == 1 in df1 that do not appear in df2, stored as strings
ids_coded_one = df1.loc[df1["hearing_noise_cat"] == 1, "IID"]
ids_in_df2 = set(df2["IID"].to_list())
missing_cases = {str(iid) for iid in set(ids_coded_one.to_list()) - ids_in_df2}

In [13]:
len(missing_cases - set(df["IID"].to_list()))

0

# 2. Sample QC

## 2.1. Remove individuals that do not match for reported and genetic sex

In [14]:
reported_sex = ["f.31.0.0"]
genetic_sex = ["f.22001.0.0"]

In [15]:
# returns True when the individual fails the sex-consistency check
def inconsistent_sexes(row):
    """Tell whether the reported and genetic sex of `row` are inconsistent.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the columns named by `reported_sex[0]` and
        `genetic_sex[0]`.

    Returns
    -------
    bool
        True when the genetic sex is missing, when the reported sex is
        missing, or when the two values differ; False otherwise.
    """
    reported = row[reported_sex[0]]
    genetic = row[genetic_sex[0]]
    # Test for missing values before comparing: `pd.NA != value` evaluates to
    # pd.NA rather than to a bool, which would leave non-boolean entries in
    # the exclusion mask built from this function.
    if pd.isna(genetic) or pd.isna(reported):
        return True
    return bool(reported != genetic)

In [16]:
# exclusion based on inconsistent sex
ex_sex = df[reported_sex + genetic_sex].apply(inconsistent_sexes, axis=1)

In [17]:
filtered = df[~ex_sex]

In [18]:
print(sum(ex_sex), "individuals removed because of inconsistency with the genetic and reported sex variables")

233 individuals removed because of inconsistency with the genetic and reported sex variables


In [19]:
print("Of these individuals", sum([1 for x in df[genetic_sex[0]].to_list() if pd.isna(x)]), "were NA for the genetic sex variable")

Of these individuals 174 were NA for the genetic sex variable


In [20]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,
1,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,
2,1000078,1000078,Female,1955,June,No,No,No,,No,...,,,,,,,,,,
3,1000081,1000081,Male,1942,February,No,,,,No,...,,,,,,,,,,
4,1000198,1000198,Female,1967,July,Yes,,,,Yes,...,,,,,,,,,,1969-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,6025295,6025295,Male,1961,April,No,,,,No,...,,,,,,,,,,
200615,6025319,6025319,Female,1953,March,No,,,,No,...,,,,,,,,,,
200616,6025346,6025346,Female,1954,October,No,,,,No,...,,,,,,,,,,
200617,6025363,6025363,Male,1944,April,No,,,,No,...,,,,,,,,,,


## 2.2. Remove non-white individuals

In [21]:
# set of answers for the ethnicity question, taken over every ethnicity column
# (not only the first three), so it matches what ancestry() sees
set().union(*(filtered[col].to_list() for col in ethnicity))

{<NA>,
 'African',
 'Any other Asian background',
 'Any other Black background',
 'Any other mixed background',
 'Any other white background',
 'Asian or Asian British',
 'Bangladeshi',
 'Black or Black British',
 'British',
 'Caribbean',
 'Chinese',
 'Do not know',
 'Indian',
 'Irish',
 'Mixed',
 'Other ethnic group',
 'Pakistani',
 'Prefer not to answer',
 'White',
 'White and Asian',
 'White and Black African',
 'White and Black Caribbean'}

In [22]:
# Answer groups for the ethnicity question. Together they should cover every possible
# answer except <NA>, "Do not know" and "Prefer not to answer".
white = ['British', 'Irish', 'White', 'Any other white background']
african = [
    'Caribbean',
    'White and Black Caribbean',
    'African',
    'White and Black African',
    'Black or Black British',
    'Any other Black background',
]
asian = [
    'Indian',
    'Pakistani',
    'White and Asian',
    'Any other Asian background',
    'Bangladeshi',
    'Asian or Asian British',
]
mixed = ['Mixed', 'Any other mixed background']
chinese = ['Chinese']
other = ['Other ethnic group']


def ancestry(row):
    """Collapse the ethnicity answers of one individual into a single label.

    Missing answers, "Prefer not to answer" and "Do not know" are ignored.
    Returns:
      * "Unknown" when no informative answer is left;
      * the answer itself when all informative answers are identical and
        that answer is one of the `white` answers;
      * "Inconsistent White" when the answers differ but all are in `white`;
      * "African", "Asian", "Mixed", "Chinese" or "Other" when every answer
        falls in that single group;
      * "Inconsistent" when the answers span several groups.
    """
    uninformative = ("Prefer not to answer", "Do not know")
    answers = []
    for value in row[ethnicity]:
        if pd.isna(value) or value in uninformative:
            continue
        answers.append(value)

    if not answers:
        return "Unknown"

    first = answers[0]
    if first in white and all(answer == first for answer in answers):
        return first  # a single, unique white answer is kept as-is

    # Checked in this order; the first group containing every answer wins.
    labelled_groups = (
        ("Inconsistent White", white),
        ("African", african),
        ("Asian", asian),
        ("Mixed", mixed),
        ("Chinese", chinese),
        ("Other", other),
    )
    for label, members in labelled_groups:
        if all(answer in members for answer in answers):
            return label
    return "Inconsistent"

Add an `ethnicity` column that combines each individual's ethnicity answers from the database into a single label.

In [23]:
# `filtered` was obtained by boolean-indexing `df`, so writing a column into it in place
# raises SettingWithCopyWarning. assign() returns a new DataFrame with the column added.
filtered = filtered.assign(ethnicity=filtered[ethnicity].apply(ancestry, axis=1))

  filtered["ethnicity"] = filtered[ethnicity].apply(ancestry, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [24]:
def find_non_white(row):
    """Return True when the row's "ethnicity" label is neither one of the `white`
    answers nor "Unknown" nor "Inconsistent White"."""
    kept_labels = [*white, "Unknown", "Inconsistent White"]
    return row["ethnicity"] not in kept_labels

In [25]:
ex_non_white = filtered[["ethnicity"]].apply(find_non_white, axis=1)

In [26]:
filtered = filtered[~ex_non_white]

In [27]:
print(sum(ex_non_white), "individuals removed for being non-white")

11385 individuals removed for being non-white


In [28]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,British
1,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,British
2,1000078,1000078,Female,1955,June,No,No,No,,No,...,,,,,,,,,,British
3,1000081,1000081,Male,1942,February,No,,,,No,...,,,,,,,,,,British
4,1000198,1000198,Female,1967,July,Yes,,,,Yes,...,,,,,,,,,1969-01-01,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,6025295,6025295,Male,1961,April,No,,,,No,...,,,,,,,,,,British
200615,6025319,6025319,Female,1953,March,No,,,,No,...,,,,,,,,,,British
200616,6025346,6025346,Female,1954,October,No,,,,No,...,,,,,,,,,,British
200617,6025363,6025363,Male,1944,April,No,,,,No,...,,,,,,,,,,British


In [29]:
filtered[filtered["ethnicity"] == "British"]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,British
1,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,British
2,1000078,1000078,Female,1955,June,No,No,No,,No,...,,,,,,,,,,British
3,1000081,1000081,Male,1942,February,No,,,,No,...,,,,,,,,,,British
4,1000198,1000198,Female,1967,July,Yes,,,,Yes,...,,,,,,,,,1969-01-01,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,6025295,6025295,Male,1961,April,No,,,,No,...,,,,,,,,,,British
200615,6025319,6025319,Female,1953,March,No,,,,No,...,,,,,,,,,,British
200616,6025346,6025346,Female,1954,October,No,,,,No,...,,,,,,,,,,British
200617,6025363,6025363,Male,1944,April,No,,,,No,...,,,,,,,,,,British


In [30]:
filtered[filtered["ethnicity"] == "Irish"]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity
16,1000415,1000415,Male,1942,December,No,,,,No,...,,,,,,,,,,Irish
54,1001316,1001316,Male,1964,September,No,,Yes,,No,...,,,,,,,,,,Irish
65,1001492,1001492,Male,1947,August,No,,,,Do not know,...,,,,,,,,,,Irish
94,1002152,1002152,Female,1939,June,No,,,,No,...,,,,,,,,,,Irish
117,1002729,1002729,Female,1966,September,No,,,,No,...,,,,,,,,,,Irish
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200387,6020127,6020127,Male,1947,July,Do not know,,,,Yes,...,,,,,,,,,,Irish
200422,6020940,6020940,Male,1963,April,No,,,,No,...,,,,,,,,,,Irish
200506,6022857,6022857,Male,1964,November,No,,,,No,...,,,,,,,,,,Irish
200553,6023832,6023832,Male,1942,June,No,,,,No,...,,,,,,,,,,Irish


In [31]:
filtered[filtered["ethnicity"] == "White"]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity
361,1008841,1008841,Female,1964,October,Yes,,,,Yes,...,,,,,,,,,,White
494,1012043,1012043,Male,1943,June,Yes,,,,Yes,...,,,,,,,,,,White
727,1017312,1017312,Female,1947,March,No,,,,No,...,,,,,,,,,,White
809,1019550,1019550,Female,1942,July,No,,,,No,...,,,,,,,,,,White
3711,1091708,1091708,Female,1965,February,No,,,,Yes,...,,,,,,,,,,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197808,5955769,5955769,Male,1965,April,No,,,,No,...,,,,,,,,,,White
199023,5986358,5986358,Female,1955,May,No,,,,Yes,...,,,,,,,,,,White
199527,5998866,5998866,Male,1946,March,No,,,,No,...,,,,,,,,,,White
199632,6001050,6001050,Female,1939,May,No,,,,No,...,,,,,,,,,,White


In [32]:
filtered[filtered["ethnicity"] == "Inconsistent White"]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity
299,1007061,1007061,Male,1950,May,No,,No,,No,...,,,,,,,,,,Inconsistent White
2424,1059988,1059988,Male,1950,July,No,No,No,,No,...,,,,,,,,,,Inconsistent White
3154,1077383,1077383,Male,1946,October,Yes,Yes,,,Yes,...,,,,,,,,,,Inconsistent White
4362,1108028,1108028,Female,1948,January,No,No,,,No,...,,,,,,,,,,Inconsistent White
4496,1111668,1111668,Female,1948,August,No,No,No,,Yes,...,,,,,,,,,,Inconsistent White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198180,5965203,5965203,Male,1944,November,No,,No,,No,...,,,,,,,,,,Inconsistent White
198498,5973198,5973198,Female,1941,April,No,No,No,,No,...,,,,,,,,,,Inconsistent White
199194,5990529,5990529,Female,1949,August,No,No,No,,No,...,,,,,,,,,,Inconsistent White
200000,6010193,6010193,Female,1948,April,No,No,No,,Yes,...,,,,,,,,,,Inconsistent White


In [33]:
filtered[filtered["ethnicity"] == "Unknown"]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity
33,1000701,1000701,Female,1949,October,No,,,,No,...,,,,,,,,,,Unknown
362,1008909,1008909,Male,1944,March,No,,,,Yes,...,,,,,,,,,,Unknown
1868,1045757,1045757,Female,1957,September,,,,,,...,,,,,,,,,2013-05-31,Unknown
2167,1053449,1053449,Male,1961,August,No,,,,No,...,,,,,,,,,,Unknown
2331,1057699,1057699,Male,1942,July,No,,,,No,...,,,,,,,,,,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198629,5976355,5976355,Female,1947,July,Yes,,,,Yes,...,,,,,,,,,,Unknown
198870,5982497,5982497,Female,1967,December,No,,,,No,...,,,,,,,,,,Unknown
198897,5983226,5983226,Male,1951,September,Yes,,,,Yes,...,,,,,,,,,,Unknown
200239,6016238,6016238,Female,1953,February,No,,,,No,...,,,,,,,,,,Unknown


In [34]:
filtered[filtered["ethnicity"] == "Any other white background"]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity
12,1000331,1000331,Female,1956,December,No,,,,No,...,,,,,,,,,,Any other white background
36,1000776,1000776,Female,1946,June,No,,,,Do not know,...,,,,,,,,,,Any other white background
38,1000858,1000858,Male,1947,May,No,,,,No,...,,,,,,,,,,Any other white background
48,1001140,1001140,Female,1950,August,Do not know,,,,Yes,...,,,,,,,,,,Any other white background
78,1001825,1001825,Female,1961,February,No,,,,No,...,,,,,,,,,,Any other white background
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200467,6022021,6022021,Male,1948,February,Do not know,,,,No,...,,,,,,,,,,Any other white background
200481,6022407,6022407,Female,1967,October,No,,,,No,...,,,,,,,,,,Any other white background
200496,6022641,6022641,Female,1946,December,No,,,,No,...,,,,,,,,,,Any other white background
200522,6023170,6023170,Female,1963,March,No,,,,No,...,,,,,,,,,,Any other white background


In [35]:
# Keep a second reference to the current working DataFrame (going by the name,
# presumably for checking counts later).
# NOTE: this is a plain name binding, not a copy; both names refer to the same object.
saved_for_number_checking = filtered

In [36]:
filtered = saved_for_number_checking

## 2.3. Remove PCA outliers from the full database

Remove the outlier individuals from the full database, if any exist.

In [37]:
# since the IID from the dataframe is in string the outlier ids have to be made into string as well
out_ids = [str(x) for x in outlier[0].to_list()]
# Same ids as a frozenset: find_outliers is called once per row, and a hash lookup
# avoids scanning the whole list each time. `out_ids` itself is left as a list.
_out_id_lookup = frozenset(out_ids)

def find_outliers(row):
    """Return True when the "IID" of `row` is one of the PCA outlier ids."""
    return row["IID"] in _out_id_lookup

In [38]:
ex_pca_outliers = filtered[["IID", "FID"]].apply(find_outliers, axis=1)

In [39]:
filtered = filtered[~ex_pca_outliers]

In [40]:
print(sum(ex_pca_outliers), "individuals removed for being pca outliers")

568 individuals removed for being pca outliers


In [41]:
# number of missing cases no longer present after the sample QC steps; compare against
# the working database `filtered` (the unfiltered `df` still contains every individual)
len(missing_cases - set(filtered["IID"].to_list()))

0

In [42]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,British
1,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,British
2,1000078,1000078,Female,1955,June,No,No,No,,No,...,,,,,,,,,,British
3,1000081,1000081,Male,1942,February,No,,,,No,...,,,,,,,,,,British
4,1000198,1000198,Female,1967,July,Yes,,,,Yes,...,,,,,,,,,1969-01-01,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,6025295,6025295,Male,1961,April,No,,,,No,...,,,,,,,,,,British
200615,6025319,6025319,Female,1953,March,No,,,,No,...,,,,,,,,,,British
200616,6025346,6025346,Female,1954,October,No,,,,No,...,,,,,,,,,,British
200617,6025363,6025363,Male,1944,April,No,,,,No,...,,,,,,,,,,British


# 3. Filter out exclusions from the full database

If individuals have certain codes from ICD 10, ICD 9, and self-reports they must be fully removed from the analysis. 

In [43]:
# tells whether the current individual should be excluded based on the exclusion list
def contains_exclusion(row, exclusion_list):
    """Return True if any non-missing value of `row` is in `exclusion_list`.

    row            -- iterable of codes for one individual; missing values are skipped
    exclusion_list -- container of codes that trigger exclusion
    """
    # the missing-value test comes first so that NA values are never compared
    return any(not pd.isna(code) and code in exclusion_list for code in row)

## 3.1. Filter out ICD 10 exclusions

In [44]:
# these are the columns that represent the icd10 columns in the database
icd10_colnames = [col for col in filtered if "f.41270" in col]

In [45]:
# get a dataframe that only contains the icd10 columns from the full database
icd10 = filtered[icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,...,f.41270.0.216,f.41270.0.217,f.41270.0.218,f.41270.0.219,f.41270.0.220,f.41270.0.221,f.41270.0.222,f.41270.0.223,f.41270.0.224,f.41270.0.225
0,E041,H738,M750,M754,M758,N898,N920,N946,R104,Z038,...,,,,,,,,,,
1,H269,K579,K590,K621,M5459,N40,R040,R31,R398,Z466,...,,,,,,,,,,
2,C19,C20,D037,D125,K635,L720,Z860,,,,...,,,,,,,,,,
3,E780,H251,H269,I10,I210,I219,I251,I252,I258,I259,...,,,,,,,,,,
4,D171,I10,J301,K409,N898,O800,Z370,Z721,Z822,Z861,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,D171,I845,I846,K318,K429,K529,K602,K610,K921,R073,...,,,,,,,,,,
200615,E669,I10,K801,K802,K85,M179,M199,M233,M8956,N921,...,,,,,,,,,,
200616,O074,,,,,,,,,,...,,,,,,,,,,
200617,,,,,,,,,,,...,,,,,,,,,,


In [46]:
# rows of the exclusion table that contain the icd10 codes flagged for removal
icd10_field_rows = exclusion["UKBB_field_code"] == "f.41270"
icd10_flagged_rows = exclusion["Excluded_fulldb_lateonsetHI"] == 'Y'
exclude_icd10 = exclusion[icd10_field_rows & icd10_flagged_rows]
exclude_icd10

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
27,f.41270,H65.2 Chronic serous otitis media,103,Y,,,,,,,,
28,f.41270,H65.3 Chronic mucoid otitis media,960,Y,,,,,,,,
29,f.41270,H65.4 Other chronic nonsuppurative otitis media,158,Y,,,,,,,,
30,f.41270,"H65.9 Nonsuppurative otitis media, unspecified",508,Y,,,,,,,,
33,f.41270,H66.1 Chronic tubotympanic suppurative otitis ...,40,Y,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
276,f.41270,"S07.9 Crushing injury of head, part unspecified",1,Y,,,,,,,,
279,f.41270,S08.1 Traumatic amputation of ear,13,Y,,,,,,,,
280,f.41270,S08.8 Traumatic amputation of other parts of head,1,Y,,,,,,,,
281,f.41270,S08.9 Traumatic amputation of unspecified part...,1,Y,,,,,,,,


In [47]:
# icd10 codes to exclude: the first space-delimited token of each phenotype label, dots removed
ex_critia_icd10 = [
    label.split(" ")[0].replace(".", "")
    for label in exclude_icd10["Phenotype"].tolist()
]
ex_critia_icd10

['H652',
 'H653',
 'H654',
 'H659',
 'H661',
 'H662',
 'H663',
 'H664',
 'H669',
 'H680',
 'H701',
 'H702',
 'H708',
 'H709',
 'H71',
 'H731',
 'H738',
 'H739',
 'H740',
 'H741',
 'H742',
 'H743',
 'H748',
 'H749',
 'H750',
 'H758',
 'H800',
 'H801',
 'H802',
 'H808',
 'H809',
 'H810',
 'H830',
 'H831',
 'H832',
 'H900',
 'H901',
 'H902',
 'H910',
 'H933',
 'H940',
 'H948',
 'H950',
 'H951',
 'H958',
 'H959',
 'B020',
 'B021',
 'B022',
 'B023',
 'B027',
 'B028',
 'G000',
 'G001',
 'G002',
 'G003',
 'G008',
 'G009',
 'G01',
 'G020',
 'G021',
 'G028',
 'G030',
 'G031',
 'G032',
 'G038',
 'G039',
 'G040',
 'G041',
 'G042',
 'G048',
 'G049',
 'G050',
 'G051',
 'G052',
 'G058',
 'G060',
 'G061',
 'G062',
 'G07',
 'G08',
 'G09',
 'G510',
 'G511',
 'G512',
 'G513',
 'G514',
 'G518',
 'G519',
 'S0200',
 'S0201',
 'S0210',
 'S0211',
 'S0240',
 'S0241',
 'S0260',
 'S0261',
 'S0270',
 'S0271',
 'S0280',
 'S0281',
 'S0290',
 'S0291',
 'S045',
 'S046',
 'S049',
 'S0600',
 'S0601',
 'S0610',
 'S0611

In [48]:
# collect the individuals that should be excluded because of icd10
def ex_fxn_icd10(row):
    """Row-wise form of the check; the name is kept so any code that calls it still works."""
    return contains_exclusion(row, ex_critia_icd10)

# Vectorised equivalent of icd10.apply(ex_fxn_icd10, axis=1): mark every cell whose
# code is in the exclusion list (missing cells never match), then flag the rows that
# contain at least one marked cell.
ex_10 = icd10.isin(ex_critia_icd10).any(axis=1)

In [49]:
# remove them from the working database (which is now filtered. df remains unchanged)
filtered = filtered[~ex_10]

In [50]:
print(sum(ex_10), "individuals removed because of icd10 codes")

5043 individuals removed because of icd10 codes


In [51]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity
1,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,British
2,1000078,1000078,Female,1955,June,No,No,No,,No,...,,,,,,,,,,British
3,1000081,1000081,Male,1942,February,No,,,,No,...,,,,,,,,,,British
4,1000198,1000198,Female,1967,July,Yes,,,,Yes,...,,,,,,,,,1969-01-01,British
5,1000210,1000210,Male,1941,October,Do not know,,,,No,...,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,6025295,6025295,Male,1961,April,No,,,,No,...,,,,,,,,,,British
200615,6025319,6025319,Female,1953,March,No,,,,No,...,,,,,,,,,,British
200616,6025346,6025346,Female,1954,October,No,,,,No,...,,,,,,,,,,British
200617,6025363,6025363,Male,1944,April,No,,,,No,...,,,,,,,,,,British


In [52]:
len(missing_cases - set(filtered["IID"].to_list()))

221

## 3.2. Filter out ICD 9 exclusions

In [53]:
# these are the columns that represent the icd9 columns in the working database
icd9_colnames = [col for col in filtered if "f.41271" in col]

In [54]:
# get a dataframe that only contains the icd9 columns from the working database
icd9 = filtered[icd9_colnames]
icd9

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,...,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,3899,4781,8131,E8860,V540,,,,,,...,,,,,,,,,,
200615,,,,,,,,,,,...,,,,,,,,,,
200616,,,,,,,,,,,...,,,,,,,,,,
200617,,,,,,,,,,,...,,,,,,,,,,


In [55]:
# rows of the exclusion table that contain the icd9 codes flagged for removal
icd9_field_rows = exclusion["UKBB_field_code"] == "f.41271"
icd9_flagged_rows = exclusion["Excluded_fulldb_lateonsetHI"] == 'Y'
exclude_icd9 = exclusion[icd9_field_rows & icd9_flagged_rows]
exclude_icd9

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
324,f.41271,3811 Chronic serous otitis media,8,Y,,,,,,,,
325,f.41271,3812 Chronic mucoid otitis media,11,Y,,,,,,,,
326,f.41271,3813 Other and unspecified chronic nonsuppurat...,3,Y,,,,,,,,
327,f.41271,"3814 Nonsuppurative otitis media, not specifie...",19,Y,,,,,,,,
328,f.41271,3815 Eustachian salpingitis,0,Y,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
521,f.41271,9050 Late effect of fracture of skull and face...,19,Y,,,,,,,,
531,f.41271,"9259 Crushing injury of face, scalp and neck",2,Y,,,,,,,,
537,f.41271,9514 Injury to facial nerve,0,Y,,,,,,,,
538,f.41271,9515 Injury to acoustic nerve,1,Y,,,,,,,,


In [56]:
# icd9 codes to exclude: the first space-delimited token of each phenotype label, dots removed
ex_critia_icd9 = [
    label.split(" ")[0].replace(".", "")
    for label in exclude_icd9["Phenotype"].tolist()
]
ex_critia_icd9

['3811',
 '3812',
 '3813',
 '3814',
 '3815',
 '3816',
 '3819',
 '3821',
 '3822',
 '3823',
 '3824',
 '3829',
 '3831',
 '3832',
 '3833',
 '3838',
 '3839',
 '3841',
 '3850',
 '3851',
 '3852',
 '3853',
 '3858',
 '3859',
 '3860',
 '3863',
 '3864',
 '3865',
 '3868',
 '3869',
 '3870',
 '3871',
 '3872',
 '3878',
 '3879',
 '3885',
 '3890',
 '0530',
 '0531',
 '0532',
 '0537',
 '0538',
 '3200',
 '3201',
 '3202',
 '3203',
 '3204',
 '3205',
 '3207',
 '3208',
 '3209',
 '3210',
 '3211',
 '3212',
 '3213',
 '3214',
 '3215',
 '3216',
 '3217',
 '3218',
 '3220',
 '3221',
 '3222',
 '3229',
 '3230',
 '3231',
 '3232',
 '3233',
 '3234',
 '3235',
 '3236',
 '3237',
 '3238',
 '3239',
 '3240',
 '3241',
 '3249',
 '3259',
 '3269',
 '3510',
 '3511',
 '3518',
 '3519',
 '8000',
 '8001',
 '8002',
 '8003',
 '8010',
 '8011',
 '8012',
 '8013',
 '8022',
 '8023',
 '8024',
 '8025',
 '8028',
 '8029',
 '8030',
 '8031',
 '8032',
 '8033',
 '8040',
 '8041',
 '8042',
 '8043',
 '8509',
 '8510',
 '8511',
 '8520',
 '8521',
 '8530',
 

In [57]:
# collect the individuals that should be excluded because of icd9
def ex_fxn_icd9(row):
    """Row-wise form of the check; the name is kept so any code that calls it still works."""
    return contains_exclusion(row, ex_critia_icd9)

# Vectorised equivalent of icd9.apply(ex_fxn_icd9, axis=1): mark every cell whose
# code is in the exclusion list (missing cells never match), then flag the rows that
# contain at least one marked cell.
ex_9 = icd9.isin(ex_critia_icd9).any(axis=1)

In [58]:
# remove them from the working database
filtered = filtered[~ex_9]

In [59]:
print(sum(ex_9), "individuals removed because of icd9 codes")

243 individuals removed because of icd9 codes


In [60]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity
1,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,British
2,1000078,1000078,Female,1955,June,No,No,No,,No,...,,,,,,,,,,British
3,1000081,1000081,Male,1942,February,No,,,,No,...,,,,,,,,,,British
4,1000198,1000198,Female,1967,July,Yes,,,,Yes,...,,,,,,,,,1969-01-01,British
5,1000210,1000210,Male,1941,October,Do not know,,,,No,...,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,6025295,6025295,Male,1961,April,No,,,,No,...,,,,,,,,,,British
200615,6025319,6025319,Female,1953,March,No,,,,No,...,,,,,,,,,,British
200616,6025346,6025346,Female,1954,October,No,,,,No,...,,,,,,,,,,British
200617,6025363,6025363,Male,1944,April,No,,,,No,...,,,,,,,,,,British


In [61]:
len(missing_cases - set(filtered["IID"].to_list()))

227

## 3.3. Filter out f.20002 exclusions

In [62]:
# these are the columns that represent the self-report columns in the working database
f20002_colnames = [col for col in filtered if "f.20002" in col]

In [63]:
# get a dataframe that only contains the self-report columns from the working database
f20002 = filtered[f20002_colnames]
f20002

Unnamed: 0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,...,f.20002.3.24,f.20002.3.25,f.20002.3.26,f.20002.3.27,f.20002.3.28,f.20002.3.29,f.20002.3.30,f.20002.3.31,f.20002.3.32,f.20002.3.33
1,1396,1473,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,1075,1440,1473,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1065,1123,1286,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,,,,,,,,,,,...,,,,,,,,,,
200615,1065,,,,,,,,,,...,,,,,,,,,,
200616,1452,1265,1387,,,,,,,,...,,,,,,,,,,
200617,,,,,,,,,,,...,,,,,,,,,,


In [64]:
# rows of the exclusion table that contain the self-report codes flagged for removal
f20002_field_rows = exclusion["UKBB_field_code"] == "f.20002"
f20002_flagged_rows = exclusion["Excluded_fulldb_lateonsetHI"] == 'Y'
exclude_f20002 = exclusion[f20002_field_rows & f20002_flagged_rows]
exclude_f20002

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
544,f.20002,1420 otosclerosis,260,Y,,,,,,,,
545,f.20002,1421 meniere's disease,1553,Y,,,,,,,,
546,f.20002,1499 labyrinthitis,417,Y,,,,,,,,
550,f.20002,1244 infection of nervous system,55,Y,,,,,,,,
551,f.20002,1245 brain abscess/intracranial abscess,79,Y,,,,,,,,
552,f.20002,1246 encephalitis,348,Y,,,,,,,,
553,f.20002,1247 meningitis,2214,Y,,,,,,,,
555,f.20002,1249 cranial nerve problem/palsy,289,Y,,,,,,,,
556,f.20002,1250 bell's palsy/facial nerve palsy,591,Y,,,,,,,,
558,f.20002,1240 neurological injury/trauma,130,Y,,,,,,,,


In [65]:
# self-report codes to exclude: the first space-delimited token of each phenotype label, dots removed
ex_critia_f20002 = [
    label.split(" ")[0].replace(".", "")
    for label in exclude_f20002["Phenotype"].tolist()
]
ex_critia_f20002

['1420',
 '1421',
 '1499',
 '1244',
 '1245',
 '1246',
 '1247',
 '1249',
 '1250',
 '1240',
 '1626',
 '1086',
 '1491',
 '1083',
 '1425']

In [66]:
# collect the individuals that should be excluded because of self-report
def ex_fxn_f20002(row):
    """Row-wise form of the check; the name is kept so any code that calls it still works."""
    return contains_exclusion(row, ex_critia_f20002)

# Vectorised equivalent of f20002.apply(ex_fxn_f20002, axis=1): mark every cell whose
# code is in the exclusion list (missing cells never match), then flag the rows that
# contain at least one marked cell.
ex_f20002 = f20002.isin(ex_critia_f20002).any(axis=1)

In [67]:
# remove them from the working database
filtered = filtered[~ex_f20002]

In [68]:
print(sum(ex_f20002), "individuals removed because of self-reported codes")

2333 individuals removed because of self-reported codes


In [69]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity
1,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,British
2,1000078,1000078,Female,1955,June,No,No,No,,No,...,,,,,,,,,,British
3,1000081,1000081,Male,1942,February,No,,,,No,...,,,,,,,,,,British
4,1000198,1000198,Female,1967,July,Yes,,,,Yes,...,,,,,,,,,1969-01-01,British
5,1000210,1000210,Male,1941,October,Do not know,,,,No,...,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,6025295,6025295,Male,1961,April,No,,,,No,...,,,,,,,,,,British
200615,6025319,6025319,Female,1953,March,No,,,,No,...,,,,,,,,,,British
200616,6025346,6025346,Female,1954,October,No,,,,No,...,,,,,,,,,,British
200617,6025363,6025363,Male,1944,April,No,,,,No,...,,,,,,,,,,British


In [70]:
len(missing_cases - set(filtered["IID"].to_list()))

384

# 4. Identify Sex Column

In [71]:
# sex coding: "Male" -> 0, anything else -> 1
def find_sex(row):
    """Encode the value in column "f.31.0.0" of `row`: 0 for "Male", 1 for any other value."""
    return 0 if row["f.31.0.0"] == "Male" else 1

sex = filtered[["f.31.0.0"]].apply(find_sex, axis=1)
sex

1         0
2         1
3         0
4         1
5         0
         ..
200614    0
200615    1
200616    1
200617    0
200618    1
Length: 180814, dtype: int64

In [72]:
filtered["sex"] = sex

  filtered["sex"] = sex
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [73]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity,sex
1,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,British,0
2,1000078,1000078,Female,1955,June,No,No,No,,No,...,,,,,,,,,British,1
3,1000081,1000081,Male,1942,February,No,,,,No,...,,,,,,,,,British,0
4,1000198,1000198,Female,1967,July,Yes,,,,Yes,...,,,,,,,,1969-01-01,British,1
5,1000210,1000210,Male,1941,October,Do not know,,,,No,...,,,,,,,,,British,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,6025295,6025295,Male,1961,April,No,,,,No,...,,,,,,,,,British,0
200615,6025319,6025319,Female,1953,March,No,,,,No,...,,,,,,,,,British,1
200616,6025346,6025346,Female,1954,October,No,,,,No,...,,,,,,,,,British,1
200617,6025363,6025363,Male,1944,April,No,,,,No,...,,,,,,,,,British,0


In [74]:
filtered = filtered.reset_index()

  filtered = filtered.reset_index()


In [75]:
filtered

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity,sex
0,1,1000035,1000035,Male,1944,May,No,,,,...,,,,,,,,,British,0
1,2,1000078,1000078,Female,1955,June,No,No,No,,...,,,,,,,,,British,1
2,3,1000081,1000081,Male,1942,February,No,,,,...,,,,,,,,,British,0
3,4,1000198,1000198,Female,1967,July,Yes,,,,...,,,,,,,,1969-01-01,British,1
4,5,1000210,1000210,Male,1941,October,Do not know,,,,...,,,,,,,,,British,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180809,200614,6025295,6025295,Male,1961,April,No,,,,...,,,,,,,,,British,0
180810,200615,6025319,6025319,Female,1953,March,No,,,,...,,,,,,,,,British,1
180811,200616,6025346,6025346,Female,1954,October,No,,,,...,,,,,,,,,British,1
180812,200617,6025363,6025363,Male,1944,April,No,,,,...,,,,,,,,,British,0


In [76]:
saved_filtered = filtered

In [77]:
filtered = saved_filtered

In [78]:
filtered[filtered["ethnicity"] == "British"]

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity,sex
0,1,1000035,1000035,Male,1944,May,No,,,,...,,,,,,,,,British,0
1,2,1000078,1000078,Female,1955,June,No,No,No,,...,,,,,,,,,British,1
2,3,1000081,1000081,Male,1942,February,No,,,,...,,,,,,,,,British,0
3,4,1000198,1000198,Female,1967,July,Yes,,,,...,,,,,,,,1969-01-01,British,1
4,5,1000210,1000210,Male,1941,October,Do not know,,,,...,,,,,,,,,British,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180809,200614,6025295,6025295,Male,1961,April,No,,,,...,,,,,,,,,British,0
180810,200615,6025319,6025319,Female,1953,March,No,,,,...,,,,,,,,,British,1
180811,200616,6025346,6025346,Female,1954,October,No,,,,...,,,,,,,,,British,1
180812,200617,6025363,6025363,Male,1944,April,No,,,,...,,,,,,,,,British,0


In [79]:
filtered[filtered["ethnicity"] == "Irish"]

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity,sex
13,16,1000415,1000415,Male,1942,December,No,,,,...,,,,,,,,,Irish,0
47,54,1001316,1001316,Male,1964,September,No,,Yes,,...,,,,,,,,,Irish,0
57,65,1001492,1001492,Male,1947,August,No,,,,...,,,,,,,,,Irish,0
84,94,1002152,1002152,Female,1939,June,No,,,,...,,,,,,,,,Irish,1
103,117,1002729,1002729,Female,1966,September,No,,,,...,,,,,,,,,Irish,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180601,200381,6020017,6020017,Male,1938,May,No,,,,...,,,,,,,,,Irish,0
180607,200387,6020127,6020127,Male,1947,July,Do not know,,,,...,,,,,,,,,Irish,0
180639,200422,6020940,6020940,Male,1963,April,No,,,,...,,,,,,,,,Irish,0
180715,200506,6022857,6022857,Male,1964,November,No,,,,...,,,,,,,,,Irish,0


In [80]:
filtered[filtered["ethnicity"] == "White"]

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity,sex
322,361,1008841,1008841,Female,1964,October,Yes,,,,...,,,,,,,,,White,1
658,727,1017312,1017312,Female,1947,March,No,,,,...,,,,,,,,,White,1
734,809,1019550,1019550,Female,1942,July,No,,,,...,,,,,,,,,White,1
3344,3711,1091708,1091708,Female,1965,February,No,,,,...,,,,,,,,,White,1
3903,4326,1107230,1107230,Male,1969,June,No,,,,...,,,,,,,,,White,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178290,197808,5955769,5955769,Male,1965,April,No,,,,...,,,,,,,,,White,0
179373,199023,5986358,5986358,Female,1955,May,No,,,,...,,,,,,,,,White,1
179830,199527,5998866,5998866,Male,1946,March,No,,,,...,,,,,,,,,White,0
179924,199632,6001050,6001050,Female,1939,May,No,,,,...,,,,,,,,,White,1


In [81]:
filtered[filtered["ethnicity"] == "Inconsistent White"]

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity,sex
266,299,1007061,1007061,Male,1950,May,No,,No,,...,,,,,,,,,Inconsistent White,0
2192,2424,1059988,1059988,Male,1950,July,No,No,No,,...,,,,,,,,,Inconsistent White,0
3931,4362,1108028,1108028,Female,1948,January,No,No,,,...,,,,,,,,,Inconsistent White,1
4812,5345,1133245,1133245,Male,1952,December,No,No,,,...,,,,,,,,,Inconsistent White,0
5049,5605,1139512,1139512,Male,1955,February,No,No,No,,...,,,,,,,,,Inconsistent White,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178624,198179,5965153,5965153,Male,1959,November,,Yes,No,,...,,,,,,,,,Inconsistent White,0
178625,198180,5965203,5965203,Male,1944,November,No,,No,,...,,,,,,,,,Inconsistent White,0
178896,198498,5973198,5973198,Female,1941,April,No,No,No,,...,,,,,,,,,Inconsistent White,1
179520,199194,5990529,5990529,Female,1949,August,No,No,No,,...,,,,,,,,,Inconsistent White,1


In [82]:
filtered[filtered["ethnicity"] == "Unknown"]

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity,sex
27,33,1000701,1000701,Female,1949,October,No,,,,...,,,,,,,,,Unknown,1
323,362,1008909,1008909,Male,1944,March,No,,,,...,,,,,,,,,Unknown,0
1962,2167,1053449,1053449,Male,1961,August,No,,,,...,,,,,,,,,Unknown,0
2994,3315,1081614,1081614,Female,1960,July,No,,,,...,,,,,,,,,Unknown,1
3086,3416,1084388,1084388,Female,1957,May,No,,,,...,,,,,,,,,Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177554,197021,5934946,5934946,Male,1963,April,Do not know,,,,...,,,,,,,,,Unknown,0
178810,198396,5970565,5970565,Female,1945,February,No,,,,...,,,,,,,,,Unknown,1
179256,198897,5983226,5983226,Male,1951,September,Yes,,,,...,,,,,,,,,Unknown,0
180472,200239,6016238,6016238,Female,1953,February,No,,,,...,,,,,,,,,Unknown,1


In [83]:
filtered[filtered["ethnicity"] == "Any other white background"]

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity,sex
10,12,1000331,1000331,Female,1956,December,No,,,,...,,,,,,,,,Any other white background,1
30,36,1000776,1000776,Female,1946,June,No,,,,...,,,,,,,,,Any other white background,1
32,38,1000858,1000858,Male,1947,May,No,,,,...,,,,,,,,,Any other white background,0
41,48,1001140,1001140,Female,1950,August,Do not know,,,,...,,,,,,,,,Any other white background,1
70,78,1001825,1001825,Female,1961,February,No,,,,...,,,,,,,,,Any other white background,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180680,200467,6022021,6022021,Male,1948,February,Do not know,,,,...,,,,,,,,,Any other white background,0
180693,200481,6022407,6022407,Female,1967,October,No,,,,...,,,,,,,,,Any other white background,1
180706,200496,6022641,6022641,Female,1946,December,No,,,,...,,,,,,,,,Any other white background,1
180729,200522,6023170,6023170,Female,1963,March,No,,,,...,,,,,,,,,Any other white background,1


# 5. f.3393, f.2247, f.2257, and Mendelian

## 5.1. Remove inconsistencies or unclear individuals

Some individuals give unclear or inconsistent answers about whether they have hearing difficulties (fields f.3393, f.2247, and f.2257). These individuals cannot be considered either controls or cases and must be removed.

The conditions for being removed are as follows:
* Saying I don't know after saying either yes or no
* Only saying I don't know or prefer not to say
* Being completely deaf

### 5.1.1. Prior to filtering for inconsistencies

<b>Hearing difficulty/problems with background noise</b> <br>
f.2257 = {'Yes': 81218, NA : 513774, 'No': 131091, 'Do not know': 4409, 'Prefer not to answer': 208}

<b>Hearing difficulty/problems</b><br>
f.2247 = {'No': 151758, NA : 513806, 'Yes': 55437, 'Do not know': 9489, 'Prefer not to answer': 171, 'I am completely deaf': 39}

<b>Hearing aid user</b><br>
f.3393 = {'No': 145486, NA : 577795, 'Yes': 7237, 'Prefer not to answer': 182}

### 5.1.2. Setup for inconsistency filtering

In [84]:
# Collect the column names used by the inconsistency filtering and the case/control steps.
# The hearing-question lists are redefined here (from the current `filtered` frame) for clarity.

hearing_imp_f3393 = [col for col in filtered if "f.3393" in col]
hearing_imp_f2247 = [col for col in filtered if "f.2247" in col]
hearing_imp_f2257 = [col for col in filtered if "f.2257" in col]

# ICD-10 (f.41270) and ICD-9 (f.41271) diagnosis columns.
# NOTE(review): these two lists are built from `df`, not from `filtered` like the lists above —
# confirm both frames carry the same diagnosis columns.
icd_10_cols = [col for col in df if "f.41270" in col]
icd_9_cols = [col for col in df if "f.41271" in col]

In [85]:
# numeric codes used to compare the answers to the hearing impairment questions
hearing_ans = {"Do not know":9, "Yes":1, "No":0}

# accumulates every answer combination that actually occurs among individuals in the database
options = set()

# pass one pheno at a time to this function
def find_options(row):
    """Record the coded answer sequence of one row in the global `options` set.

    Missing values, "I am completely deaf" and "Prefer not to answer" are skipped;
    each remaining answer is mapped through `hearing_ans` and the digits are
    concatenated in row order. Returns None.
    """
    skipped = ("I am completely deaf", "Prefer not to answer")
    digits = []
    for reply in row:
        if pd.isna(reply) or reply in skipped:
            continue
        digits.append(str(hearing_ans[reply]))
    options.add("".join(digits))

In [86]:
# populate `options` with every unique answer combination that individuals in the database
# gave to each hearing impairment question, one question (column group) at a time
for question_cols in (hearing_imp_f3393, hearing_imp_f2247, hearing_imp_f2257):
    hearing_imp_qs = filtered[question_cols]
    s = hearing_imp_qs.apply(find_options, axis=1)

In [87]:
options

{'',
 '0',
 '00',
 '000',
 '0000',
 '0001',
 '0009',
 '001',
 '0010',
 '0011',
 '0019',
 '009',
 '0090',
 '0091',
 '0099',
 '01',
 '010',
 '0100',
 '0101',
 '011',
 '0110',
 '0111',
 '0119',
 '019',
 '0191',
 '0199',
 '09',
 '090',
 '0900',
 '0901',
 '0909',
 '091',
 '0910',
 '0911',
 '0919',
 '099',
 '0990',
 '0991',
 '1',
 '10',
 '100',
 '1000',
 '1001',
 '1009',
 '101',
 '1010',
 '1011',
 '109',
 '1090',
 '1099',
 '11',
 '110',
 '1100',
 '1101',
 '111',
 '1110',
 '1111',
 '1119',
 '119',
 '1190',
 '1191',
 '19',
 '190',
 '1900',
 '1901',
 '191',
 '1911',
 '199',
 '9',
 '90',
 '900',
 '9000',
 '901',
 '909',
 '9090',
 '91',
 '910',
 '911',
 '9110',
 '9111',
 '919',
 '99',
 '990',
 '991',
 '9911',
 '999',
 '9999'}

In [88]:
# We might have inconsistencies if "Do not know" is mixed with other answers, or if "Yes" and "No"
# appear together. `options` contains the set of all unique answer codes
# (digits: 0 = No, 1 = Yes, 9 = Do not know, in column order).

# codes that mix exactly two kinds of answer, or all three
do_not_know_no = [i for i in options if '0' in i and '9' in i and '1' not in i]
do_not_know_yes = [i for i in options if '0' not in i and '9' in i and '1' in i]
yes_no = [i for i in options if '0' in i and '9' not in i and '1' in i]
with_all_three = [i for i in options if '0' in i and '9' in i and '1' in i]

# collecting all possible occurrences that can be flagged for being inconsistent
might_inconsistent = do_not_know_no + do_not_know_yes + yes_no + with_all_three

# these mixed-answer codes are exceptions: they are NOT treated as inconsistent
exceptions = ["91","911","9111","991","0001","001","0011","01","011","0111", "0091", "091","0911","0991","9001","901","9011", "90", "900", "9000", "990"]

# collect the answer combinations that are actually inconsistent by removing the combinations that are exceptions
inconsistent = [i for i in might_inconsistent if i not in exceptions]

In [89]:
inconsistent

['9090',
 '0990',
 '0900',
 '09',
 '0099',
 '0090',
 '099',
 '009',
 '909',
 '090',
 '0009',
 '0909',
 '199',
 '1191',
 '1911',
 '19',
 '1119',
 '191',
 '919',
 '9911',
 '119',
 '0110',
 '10',
 '1100',
 '1000',
 '101',
 '0010',
 '1011',
 '1101',
 '0100',
 '1010',
 '010',
 '100',
 '0101',
 '110',
 '1001',
 '1110',
 '109',
 '9110',
 '910',
 '0019',
 '0191',
 '190',
 '0910',
 '0119',
 '0919',
 '1900',
 '019',
 '1901',
 '0901',
 '1099',
 '1090',
 '0199',
 '1190',
 '1009']

### 5.1.3. Filtering out the data

In [90]:
# True when every answer in the row is NA (the individual answered nothing)
def find_empty(row):
    """Return True if all entries of `row` are missing, otherwise False."""
    return all(pd.isna(value) for value in row)

In [91]:
# True when the individual answered "Do not know" at least once but never "Yes" or "No"
def find_dont_know(row):
    """Return True if the non-missing answers include "Do not know" and neither "Yes" nor "No"."""
    answered = [value for value in row if not pd.isna(value)]
    gave_definite_answer = "Yes" in answered or "No" in answered
    return "Do not know" in answered and not gave_definite_answer

In [92]:
# True means the row should be removed
# call it on the columns of one pheno at a time
def find_inconsistencies(row):
    """Flag a row for removal.

    Returns True when any answer is "I am completely deaf", or when the coded
    answer sequence (missing values and "Prefer not to answer" skipped, the rest
    mapped through `hearing_ans`) is listed in `inconsistent`. Otherwise False.
    """
    present = [value for value in row if not pd.isna(value)]
    if "I am completely deaf" in present:
        return True

    code = "".join(
        str(hearing_ans[value]) for value in present if value != "Prefer not to answer"
    )
    return code in inconsistent

# individuals who never give a definitive answer (only NA / prefer not to say / do not know) are cancelled out
# True when no entry of the row is "Yes" or "No"
def find_all_none(row):
    """Return True if the row contains no "Yes" and no "No" answer, otherwise False."""
    definitive = ("Yes", "No")
    return not any((not pd.isna(value)) and value in definitive for value in row)

In [93]:
filtered

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity,sex
0,1,1000035,1000035,Male,1944,May,No,,,,...,,,,,,,,,British,0
1,2,1000078,1000078,Female,1955,June,No,No,No,,...,,,,,,,,,British,1
2,3,1000081,1000081,Male,1942,February,No,,,,...,,,,,,,,,British,0
3,4,1000198,1000198,Female,1967,July,Yes,,,,...,,,,,,,,1969-01-01,British,1
4,5,1000210,1000210,Male,1941,October,Do not know,,,,...,,,,,,,,,British,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180809,200614,6025295,6025295,Male,1961,April,No,,,,...,,,,,,,,,British,0
180810,200615,6025319,6025319,Female,1953,March,No,,,,...,,,,,,,,,British,1
180811,200616,6025346,6025346,Female,1954,October,No,,,,...,,,,,,,,,British,1
180812,200617,6025363,6025363,Male,1944,April,No,,,,...,,,,,,,,,British,0


In [94]:
# filter out inconsistencies for f3393
hearing_imp_qs = filtered[hearing_imp_f3393]
exclude = hearing_imp_qs.apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

In [95]:
len(missing_cases - set(filtered["IID"].to_list()))

450

In [96]:
# filter out inconsistencies for f2247
hearing_imp_qs = filtered[hearing_imp_f2247]
exclude = hearing_imp_qs.apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

In [97]:
len(missing_cases - set(filtered["IID"].to_list()))

1800

In [98]:
# filter out inconsistencies for f2257
hearing_imp_qs = filtered[hearing_imp_f2257]
exclude = hearing_imp_qs.apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

In [99]:
len(missing_cases - set(filtered["IID"].to_list()))

1801

In [100]:
# filter out individuals that don't have a definitive answer (Yes/No) for any of the hearing questions
hearing_imp_qs = filtered[hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257]
exclude = hearing_imp_qs.apply(find_all_none, axis=1)
# take an explicit copy so that columns added later (e.g. "hearing_imp_pure_ctrl")
# are written to an independent frame instead of a slice (avoids SettingWithCopyWarning)
filtered = filtered[~exclude].copy()

In [101]:
len(missing_cases - set(filtered["IID"].to_list()))

1801

In [102]:
filtered

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity,sex
0,1,1000035,1000035,Male,1944,May,No,,,,...,,,,,,,,,British,0
1,2,1000078,1000078,Female,1955,June,No,No,No,,...,,,,,,,,,British,1
2,3,1000081,1000081,Male,1942,February,No,,,,...,,,,,,,,,British,0
3,4,1000198,1000198,Female,1967,July,Yes,,,,...,,,,,,,,1969-01-01,British,1
4,5,1000210,1000210,Male,1941,October,Do not know,,,,...,,,,,,,,,British,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180809,200614,6025295,6025295,Male,1961,April,No,,,,...,,,,,,,,,British,0
180810,200615,6025319,6025319,Female,1953,March,No,,,,...,,,,,,,,,British,1
180811,200616,6025346,6025346,Female,1954,October,No,,,,...,,,,,,,,,British,1
180812,200617,6025363,6025363,Male,1944,April,No,,,,...,,,,,,,,,British,0


In [103]:
saved_2_filtered = filtered

In [104]:
filtered = saved_2_filtered

## 5.2. Identify Pure Controls

Need to make sure that for f.3393, f.2247, and f.2257 we obtain the individuals that say no.

The conditions for saying no are as follows:
* Never saying yes
* Not being inconsistent
* Saying no at least once (prefer not to say is allowed)

We also exclude individuals from the control group if they have certain ICD9, ICD10, or f.20002 codes, even when they say no to all of f.3393, f.2247, and f.2257. However, these individuals can still be part of the cases.

In [105]:
# 0 marks a ctrl, 1 marks everything else
def find_ctrl(row):
    """Return 0 if the row has at least one "No" and no "Yes", otherwise 1.

    Missing values and "Prefer not to answer" are ignored; every other answer
    is mapped through `hearing_ans` before the check.
    """
    kept = (value for value in row if not (pd.isna(value) or value == "Prefer not to answer"))
    answer = "".join(str(hearing_ans[value]) for value in kept)
    is_ctrl = "0" in answer and "1" not in answer
    return 0 if is_ctrl else 1

# 0 marks a ctrl, 1 marks everything else
# used for f3393, which was only asked under certain circumstances, so an all-NA row still counts as ctrl
def find_ctrl_or_NA(row):
    """Return 1 if any entry is something other than NA, "No" or "Prefer not to answer"; else 0."""
    allowed = ("No", "Prefer not to answer")
    disqualified = any(not pd.isna(value) and value not in allowed for value in row)
    return 1 if disqualified else 0


In [106]:
# filter through the hearing impairment questions to find the controls
hearing_imp_qs = filtered[hearing_imp_f3393]
f3393_ctrl = hearing_imp_qs.apply(find_ctrl_or_NA, axis=1).to_list()
hearing_imp_qs = filtered[hearing_imp_f2247]
f2247_ctrl = hearing_imp_qs.apply(find_ctrl, axis=1).to_list()
hearing_imp_qs = filtered[hearing_imp_f2257]
f2257_ctrl = hearing_imp_qs.apply(find_ctrl, axis=1).to_list()

In [107]:
pure_ctrl = [0 if i == 0 and f2247_ctrl[en] == 0 and f2257_ctrl[en] == 0 else 1 for en, i in enumerate(f3393_ctrl)]

In [108]:
print(len(pure_ctrl) - sum(pure_ctrl), "individuals are controls prior to filtration for icd10, icd9 and self-reported codes")

97384 individuals are controls prior to filtration for icd10, icd9 and self-reported codes


### 5.2.1. Collect ICD 10 codes to filter out from Ctrl

In [109]:
exclude_ctrl_icd10 = exclusion[(exclusion["UKBB_field_code"] == "f.41270") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd10

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
97,f.41270,H83.3 Noise effects on inner ear,24,N,Y,,,,,,,
98,f.41270,H83.8 Other specified diseases of inner ear,51,N,Y,,,,,,,
99,f.41270,"H83.9 Disease of inner ear, unspecified",33,N,Y,,,,,,,
105,f.41270,"H90.3 Sensorineural hearing loss, bilateral",721,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,Individuals with this code were initially excl...
106,f.41270,"H90.4 Sensorineural hearing loss, unilateral w...",185,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,N,,
107,f.41270,"H90.5 Sensorineural hearing loss, unspecified",880,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,Individuals with this code were initially excl...
108,f.41270,H90.6 Mixed conductive and sensorineural heari...,133,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,Individuals with this code were initially excl...
109,f.41270,H90.7 Mixed conductive and sensorineural heari...,75,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,N,,
110,f.41270,H90.8 Mixed conductive and sensorineural heari...,115,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,Individuals with this code were initially excl...
113,f.41270,H91.1 Presbycusis,408,N,Y,,,,,N,,


In [110]:
ex_critia_ctrl_icd10 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd10["Phenotype"].tolist()]
ex_critia_ctrl_icd10

['H833',
 'H838',
 'H839',
 'H903',
 'H904',
 'H905',
 'H906',
 'H907',
 'H908',
 'H911',
 'H912',
 'H913',
 'H918',
 'H919',
 'H930',
 'H932',
 'H933',
 'H938',
 'H939',
 'Z461',
 'Z974']

In [111]:
icd10 = filtered[icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,...,f.41270.0.216,f.41270.0.217,f.41270.0.218,f.41270.0.219,f.41270.0.220,f.41270.0.221,f.41270.0.222,f.41270.0.223,f.41270.0.224,f.41270.0.225
0,H269,K579,K590,K621,M5459,N40,R040,R31,R398,Z466,...,,,,,,,,,,
1,C19,C20,D037,D125,K635,L720,Z860,,,,...,,,,,,,,,,
2,E780,H251,H269,I10,I210,I219,I251,I252,I258,I259,...,,,,,,,,,,
3,D171,I10,J301,K409,N898,O800,Z370,Z721,Z822,Z861,...,,,,,,,,,,
4,C73,C780,D70,E780,F412,G473,G479,G620,I48,I828,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180809,D171,I845,I846,K318,K429,K529,K602,K610,K921,R073,...,,,,,,,,,,
180810,E669,I10,K801,K802,K85,M179,M199,M233,M8956,N921,...,,,,,,,,,,
180811,O074,,,,,,,,,,...,,,,,,,,,,
180812,,,,,,,,,,,...,,,,,,,,,,


In [112]:
# collect the individuals who should not be part of controls because of icd 10 codes
ex_fxn_icd10 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd10)
ex_10 = icd10.apply(ex_fxn_icd10, axis=1)

### 5.2.2. Collect ICD 9 codes to filter out from Ctrl

In [113]:
exclude_ctrl_icd9 = exclusion[(exclusion["UKBB_field_code"] == "f.41271") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd9

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
376,f.41271,3880 Degenerative and vascular disorders of ear,0,N,Y,,,,,,,
377,f.41271,3881 Noise effects on inner ear,0,N,Y,,,,,,,
378,f.41271,"3882 Sudden hearing loss, unspecified",0,N,Y,,,,,,,
380,f.41271,3884 Other abnormal auditory perception,0,N,Y,,,,,,,
384,f.41271,3888 Other specified disorders of ear,1,N,Y,,,,,,,
385,f.41271,"3889 Disorders of ear, unspecified",2,N,Y,,,,,,,
388,f.41271,3891 Sensorineural deafness,6,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,
389,f.41271,3892 Mixed conductive and sensorineural deafness,1,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,
390,f.41271,"3897 Deaf mutism, not elsewhere classifiable",1,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,
391,f.41271,3898 Other specified forms of deafness,0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,,Y,,


In [114]:
ex_critia_ctrl_icd9 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd9["Phenotype"].tolist()]
ex_critia_ctrl_icd9

['3880',
 '3881',
 '3882',
 '3884',
 '3888',
 '3889',
 '3891',
 '3892',
 '3897',
 '3898',
 '3899',
 'V412',
 'V532']

In [115]:
icd9 = filtered[icd9_colnames]
icd9

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,...,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180809,3899,4781,8131,E8860,V540,,,,,,...,,,,,,,,,,
180810,,,,,,,,,,,...,,,,,,,,,,
180811,,,,,,,,,,,...,,,,,,,,,,
180812,,,,,,,,,,,...,,,,,,,,,,


In [116]:
# collect the individuals who should not be part of controls because of icd 9 codes
ex_fxn_icd9 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd9)
ex_9 = icd9.apply(ex_fxn_icd9, axis=1)

### 5.2.3. Collect f20002 codes to filter out from Ctrl

In [117]:
exclude_ctrl_f20002 = exclusion[(exclusion["UKBB_field_code"] == "f.20002") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_f20002

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_f4803,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations


In [118]:
f20002 = filtered[f20002_colnames]
f20002

Unnamed: 0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,...,f.20002.3.24,f.20002.3.25,f.20002.3.26,f.20002.3.27,f.20002.3.28,f.20002.3.29,f.20002.3.30,f.20002.3.31,f.20002.3.32,f.20002.3.33
0,1396,1473,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,1075,1440,1473,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,1065,1123,1286,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180809,,,,,,,,,,,...,,,,,,,,,,
180810,1065,,,,,,,,,,...,,,,,,,,,,
180811,1452,1265,1387,,,,,,,,...,,,,,,,,,,
180812,,,,,,,,,,,...,,,,,,,,,,


In [119]:
ex_critia_ctrl_f20002 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_f20002["Phenotype"].tolist()]
ex_critia_ctrl_f20002

[]

In [120]:
# collect the individuals who should not be part of controls because of self-reported codes
ex_fxn_f20002 = lambda row: contains_exclusion(row, ex_critia_ctrl_f20002)
ex_f20002 = f20002.apply(ex_fxn_f20002, axis=1)

### 5.2.4. Filter out the HI Ctrl

In [121]:
# pure_ctrl is a plain list of 0/1 flags in row order; wrap it in a boolean Series on the shared
# row index before OR-ing (logical ops between a list and a Series are deprecated in pandas)
temp = pd.Series(pure_ctrl, index=ex_10.index).astype(bool) | ex_10 | ex_9 | ex_f20002

In [122]:
# because individuals that are controls are labeled as 0
# temp says True if an individual is not a control and False if it is a control
# ex_10, ex_9, and ex_f20002 are True for individuals that are not controls and False for individuals that are controls

# pure_ctrl is a plain list of 0/1 flags in row order; wrap it in a boolean Series on the shared
# row index first (logical ops between a list and a Series are deprecated in pandas, and
# constructing the Series also fails loudly if the lengths do not match)
not_ctrl_by_answers = pd.Series(pure_ctrl, index=ex_10.index).astype(bool)
temp = not_ctrl_by_answers | ex_10 | ex_9 | ex_f20002

# we set the control as 0 for each individual that is False in temp
filtered_ctrl = [1 if i else 0 for i in temp.to_list()]

In [123]:
print(len(filtered_ctrl) - sum(filtered_ctrl), "individuals are controls after addition filtration for icd10, icd9 and self-reported codes")

96837 individuals are controls after addition filtration for icd10, icd9 and self-reported codes


In [124]:
filtered["hearing_imp_pure_ctrl"] = filtered_ctrl

  filtered["hearing_imp_pure_ctrl"] = filtered_ctrl


In [125]:
filtered

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46,f.131258.0.0,ethnicity,sex,hearing_imp_pure_ctrl
0,1,1000035,1000035,Male,1944,May,No,,,,...,,,,,,,,British,0,1
1,2,1000078,1000078,Female,1955,June,No,No,No,,...,,,,,,,,British,1,0
2,3,1000081,1000081,Male,1942,February,No,,,,...,,,,,,,,British,0,0
3,4,1000198,1000198,Female,1967,July,Yes,,,,...,,,,,,,1969-01-01,British,1,1
4,5,1000210,1000210,Male,1941,October,Do not know,,,,...,,,,,,,,British,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180809,200614,6025295,6025295,Male,1961,April,No,,,,...,,,,,,,,British,0,1
180810,200615,6025319,6025319,Female,1953,March,No,,,,...,,,,,,,,British,1,0
180811,200616,6025346,6025346,Female,1954,October,No,,,,...,,,,,,,,British,1,0
180812,200617,6025363,6025363,Male,1944,April,No,,,,...,,,,,,,,British,0,0


In [126]:
# give the rows a fresh 0..n-1 index after the filtering steps
# NOTE(review): `filtered` already has an "index" column from the earlier reset_index(), so pandas
# stores the old index in a new "level_0" column here, and re-running this cell raises because
# "level_0" would already exist — confirm whether reset_index(drop=True) is intended.
filtered = filtered.reset_index()

  filtered = filtered.reset_index()


In [127]:
saved_3_filtered = filtered

In [128]:
filtered = saved_3_filtered

## 5.3. Identify All Age and Phenotype Columns

In [129]:
ages_f21003_col = [col for col in filtered if "f.21003" in col]
ages_f21003_col

['f.21003.0.0', 'f.21003.1.0', 'f.21003.2.0', 'f.21003.3.0']

In [130]:
ages_f131258_col = [col.strip('"') for col in header if 'f.131258' in col]
ages_f131258_col

['f.131258.0.0']

In [131]:
filtered[ages_f21003_col]

Unnamed: 0,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0
0,63,,,
1,52,57,60,
2,67,,,
3,41,,,
4,66,,,
...,...,...,...,...
175080,46,,,
175081,56,,,
175082,53,,,
175083,64,,,


In [132]:
filtered[ages_f131258_col]

Unnamed: 0,f.131258.0.0
0,
1,
2,
3,1969-01-01
4,
...,...
175080,
175081,
175082,
175083,


In [133]:
# get the latest time that an individual said no to any of the phenotypes
# return the oldest age that they were
def get_ctrl_age(row):
    """Return the age of a control at their most recent "No" answer.

    For rows with row["hearing_imp_pure_ctrl"] == 0, each hearing question
    (f.3393, f.2247, f.2257 column groups) is scanned from its last column
    backwards for the first "No"; the age in the same position of the reversed
    `ages_f21003_col` columns is collected, and the numerically largest
    collected age is returned (as the original stored value).

    Returns pd.NA for non-controls, and also when no usable age was found.
    """
    if row["hearing_imp_pure_ctrl"] != 0:
        return pd.NA

    # both lists are walked last-to-first and matched by position
    # NOTE(review): assumes each question has one column per age column — confirm
    visit_ages = row[ages_f21003_col].to_list()[::-1]

    ages = []
    for phen in (hearing_imp_f3393, hearing_imp_f2247, hearing_imp_f2257):
        answers = row[phen].to_list()[::-1]
        for pos, answer in enumerate(answers):
            if not pd.isna(answer) and answer == "No":
                ages.append(visit_ages[pos])
                break

    # a missing age cannot be ordered, so leave it out of the comparison
    known_ages = [age for age in ages if not pd.isna(age)]
    if not known_ages:
        return pd.NA

    # the columns may hold ages as strings; compare them numerically, not lexicographically
    return max(known_ages, key=float)

# get the earliest time that an individual said yes to having a phenotype
def get_phen_age(row):
    """Return the age at the first "Yes" answer for one phenotype.

    Layout expected by position: entry 0 is the phenotype flag, entries 1..-5
    are the answers in column order; the last four entries are skipped
    (presumably the four age columns — confirm against the caller).

    Returns the value from `ages_f21003_col` at the position of the first
    "Yes", or pd.NA when the flag is not 1 or when no "Yes" answer is present.
    """
    # .iloc makes the positional access explicit (plain row[0] on a label-indexed Series is deprecated)
    if row.iloc[0] != 1:
        return pd.NA

    for pos, answer in enumerate(row.iloc[1:-4].to_list()):
        if not pd.isna(answer) and answer == "Yes":
            return row[ages_f21003_col].iloc[pos]

    # flagged as a case but no "Yes" found: report missing explicitly instead of falling through to None
    return pd.NA
    
# return the minimum age in the row, or else return NA
def get_min_age(row):
    """Return the smallest non-missing value in `row`, or pd.NA if every value is missing."""
    known = list(filter(lambda value: not pd.isna(value), row.to_list()))
    if not known:
        return pd.NA
    return min(known)
    

In [134]:
def find_yes(row):
    """Return 1 if any non-missing entry of ``row`` equals "Yes", otherwise 0 (used to flag phenotypes)."""
    answered_yes = any(
        (not pd.isna(answer)) and answer == "Yes"
        for answer in row
    )
    return 1 if answered_yes else 0

def find_medelian_like(row):
    """Return 1 if the row carries one of the Mendelian-like ICD codes, otherwise 0.

    The ICD-10 values are read from ``row[icd_10_cols]`` and the ICD-9 values
    from ``row[icd_9_cols]``; the first match in either group returns 1.

    NOTE(review): the original comment said the individual must also have at
    least one of the hearing phenotypes, and the caller passes the "f3393",
    "f2247" and "f2257" columns, but those columns are not inspected here -
    confirm which behaviour is intended.
    """
    code_groups = (
        (icd_10_cols, ("H903", "H905", "H906", "H908", "H913", "H918", "H919")),
        (icd_9_cols, ("3891", "3892", "3897", "3898", "3899")),
    )
    for columns, mendelian_codes in code_groups:
        for entry in row[columns]:
            if pd.isna(entry):
                continue
            if entry in mendelian_codes:
                return 1
    return 0

def find_exclusions(row):
    """Drop early-onset cases that carry one of the listed ICD codes.

    The row is read positionally: element 0 is the 0/1 phenotype flag
    ("f3393", "f2247" or "f2257") and element 1 is the age recorded for that
    phenotype. The ICD values are read from ``row[icd_10_cols]`` and
    ``row[icd_9_cols]``.

    Returns
    -------
    0    when the flag is 1, at least one listed ICD-10/ICD-9 code is present
         and the phenotype age is <= 55;
    int  the unchanged flag otherwise;
    None when the flag or the age cannot be interpreted as a number (for
         example a missing age on a row with a matching code). The row label
         is printed in that case, as before.
    """
    mendelian_icd10 = ["H903", "H904", "H905", "H906", "H907", "H908"]
    mendelian_icd9 = ["3891", "3892", "3897", "3898", "3899"]
    max_excluded_age = 55  # cases at or below this age with a listed code are dropped

    # .iloc makes the positional access explicit; plain row[0] / row[1] on a
    # label-indexed Series is deprecated in recent pandas.
    flag = row.iloc[0]
    age = row.iloc[1]

    try:
        if 1 == flag:
            has_listed_code = any(
                not pd.isna(code) and code in mendelian_icd10 for code in row[icd_10_cols]
            ) or any(
                not pd.isna(code) and code in mendelian_icd9 for code in row[icd_9_cols]
            )
            # The age is only converted when a listed code is present, so a
            # missing age on a row without such a code is not an error.
            if has_listed_code and int(age) <= max_excluded_age:
                return 0
        return int(flag)
    except (TypeError, ValueError):
        # Only conversion/comparison problems on missing or malformed values are
        # handled here; a bare `except:` would also hide KeyError from missing
        # columns and swallow KeyboardInterrupt during a long apply().
        print(row.name)
        return None

def find_f3393_other_cases(row):
    """Promote a non-case to an f3393 case when one of the listed ICD codes is present.

    Rows whose "f3393" value is already non-zero are returned unchanged (as int).
    Rows with "f3393" == 0 return 1 when ``row[icd_10_cols]`` contains "Z461" or
    "Z974", or ``row[icd_9_cols]`` contains "V412" or "V532"; otherwise 0.
    """
    current = int(row["f3393"])
    if current != 0:
        return current

    lookups = (
        (icd_10_cols, ("Z461", "Z974")),
        (icd_9_cols, ("V412", "V532")),
    )
    for columns, codes in lookups:
        if any(not pd.isna(entry) and entry in codes for entry in row[columns]):
            return 1
    return current

def check_code(row):
    """Return 1 if any non-missing entry of ``row`` is exactly "H919", otherwise 0."""
    found = False
    for entry in row:
        if pd.isna(entry):
            continue
        if entry == "H919":
            found = True
            break
    return 1 if found else 0

In [149]:
hearing_imp_qs = filtered[hearing_imp_f3393]
filtered["f3393"] = hearing_imp_qs.apply(find_yes, axis=1)
filtered["f3393_age"] = filtered[["f3393"] + hearing_imp_f3393 + ages_f21003_col].apply(get_phen_age, axis=1)

In [150]:
filtered[(filtered["f3393"] == 1) & (pd.isna(filtered["f3393_age"])) & (pd.isna(filtered[ages_f131258_col[0]]))][["f3393", "f3393_age"] + ages_f131258_col]

Unnamed: 0,f3393,f3393_age,f.131258.0.0


In [151]:
filtered["f3393"] = filtered[["f3393", "f3393_age"] + icd_10_cols + icd_9_cols].apply(find_exclusions, axis=1)

In [152]:
filtered[(filtered["f3393"] == 1) & (pd.isna(filtered["f3393_age"])) & (pd.isna(filtered[ages_f131258_col[0]]))][["f3393", "f3393_age"] + ages_f131258_col]

Unnamed: 0,f3393,f3393_age,f.131258.0.0


In [153]:
filtered["f3393"] = filtered[["f3393"] + icd_10_cols + icd_9_cols].apply(find_f3393_other_cases, axis=1)

In [154]:
filtered[(filtered["f3393"] == 1) & (pd.isna(filtered["f3393_age"])) & (pd.isna(filtered[ages_f131258_col[0]]))][["f3393", "f3393_age"] + ages_f131258_col]

Unnamed: 0,f3393,f3393_age,f.131258.0.0
217,1,,
345,1,,
691,1,,
1949,1,,
3056,1,,
...,...,...,...
170900,1,,
171118,1,,
173374,1,,
173415,1,,


In [137]:
hearing_imp_qs = filtered[hearing_imp_f2247]
filtered["f2247"] = hearing_imp_qs.apply(find_yes, axis=1)
filtered["f2247_age"] = filtered[["f2247"] + hearing_imp_f2247 + ages_f21003_col].apply(get_phen_age, axis=1)
filtered["f2247"] = filtered[["f2247", "f2247_age"] + icd_10_cols + icd_9_cols].apply(find_exclusions, axis=1)

  filtered["f2247"] = hearing_imp_qs.apply(find_yes, axis=1)
  filtered["f2247_age"] = filtered[["f2247"] + hearing_imp_f2247 + ages_f21003_col].apply(get_phen_age, axis=1)


In [138]:
filtered[(filtered["f2247"] == 1) & (pd.isna(filtered["f2247_age"]))][["f2247", "f2247_age"]]

Unnamed: 0,f2247,f2247_age


In [139]:
hearing_imp_qs = filtered[hearing_imp_f2257]
filtered["f2257"] = hearing_imp_qs.apply(find_yes, axis=1)
filtered["f2257_age"] = filtered[["f2257"] + hearing_imp_f2257 + ages_f21003_col].apply(get_phen_age, axis=1)
filtered["f2257"] = filtered[["f2257", "f2257_age"] + icd_10_cols + icd_9_cols].apply(find_exclusions, axis=1)

  filtered["f2257"] = hearing_imp_qs.apply(find_yes, axis=1)
  filtered["f2257_age"] = filtered[["f2257"] + hearing_imp_f2257 + ages_f21003_col].apply(get_phen_age, axis=1)


In [140]:
filtered[(filtered["f2257"] == 1) & (pd.isna(filtered["f2257_age"]))][["f2257", "f2257_age"]]

Unnamed: 0,f2257,f2257_age


In [141]:
filtered["mendelian_age"] = filtered[ages_f21003_col].apply(get_min_age, axis=1)
filtered["mendelian"] = filtered[icd_10_cols + icd_9_cols + ["f3393", "f2247", "f2257"]].apply(find_medelian_like, axis=1)

  filtered["mendelian_age"] = filtered[ages_f21003_col].apply(get_min_age, axis=1)
  filtered["mendelian"] = filtered[icd_10_cols + icd_9_cols + ["f3393", "f2247", "f2257"]].apply(find_medelian_like, axis=1)


In [142]:
filtered[(filtered["mendelian"] == 1) & (pd.isna(filtered["mendelian_age"]))][["mendelian", "mendelian_age"]]

Unnamed: 0,mendelian,mendelian_age


In [143]:
filtered["ctrl_age"] = filtered[["hearing_imp_pure_ctrl"] + ages_f21003_col +  hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257].apply(get_ctrl_age, axis=1)

  filtered["ctrl_age"] = filtered[["hearing_imp_pure_ctrl"] + ages_f21003_col +  hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257].apply(get_ctrl_age, axis=1)


In [144]:
filtered[(filtered["hearing_imp_pure_ctrl"] == 0) & (pd.isna(filtered["ctrl_age"]))][["hearing_imp_pure_ctrl", "ctrl_age"]]

Unnamed: 0,hearing_imp_pure_ctrl,ctrl_age


In [145]:
filtered["f2247_f2257"] = filtered["f2247"] & filtered["f2257"]
filtered["f2247_f2257_age"] = filtered[["f2247_age", "f2257_age"]].apply(get_min_age, axis=1)

  filtered["f2247_f2257"] = filtered["f2247"] & filtered["f2257"]
  filtered["f2247_f2257_age"] = filtered[["f2247_age", "f2257_age"]].apply(get_min_age, axis=1)


In [146]:
filtered[(filtered["f2247_f2257"] == 1) & (pd.isna(filtered["f2247_f2257_age"]))][["f2247_f2257", "f2247_f2257_age"]]

Unnamed: 0,f2247_f2257,f2247_f2257_age


## 5.4. File Output

In [None]:
filtered

In [None]:
filtered[filtered["hearing_imp_pure_ctrl"] == 0][["FID", "IID", "sex", "hearing_imp_pure_ctrl", "ctrl_age"]].to_csv("pure_ctrl_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["f3393"] == 1][["FID", "IID", "sex", "f3393", "f3393_age"]]

In [None]:
filtered[filtered["f3393"] == 1][["FID", "IID", "sex", "f3393", "f3393_age"]].to_csv("f3393_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["f2247"] == 1][["FID", "IID", "sex", "f2247", "f2247_age"]]

In [None]:
filtered[filtered["f2247"] == 1][["FID", "IID", "sex", "f2247", "f2247_age"]].to_csv("f2247_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["f2257"] == 1][["FID", "IID", "sex", "f2257", "f2257_age"]]

In [None]:
filtered[filtered["f2257"] == 1][["FID", "IID", "sex", "f2257", "f2257_age"]].to_csv("f2257_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["f2247_f2257"] == 1][["FID", "IID", "sex", "f2247_f2257", "f2247_f2257_age"]]

In [None]:
filtered[filtered["f2247_f2257"] == 1][["FID", "IID", "sex", "f2247_f2257", "f2247_f2257_age"]].to_csv("f2247_f2257_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["mendelian"] == 1][["FID", "IID", "sex", "mendelian", "mendelian_age"]]

In [None]:
filtered[filtered["mendelian"] == 1][["FID", "IID", "sex", "mendelian", "mendelian_age"]].to_csv("mendelian_pheno_file.tsv", sep='\t', index=False)

# 6. Tinnitus

## 6.1. Remove inconsistencies or unclear individuals

### 6.1.1. Prior to filtering for inconsistencies

<b>Tinnitus</b> <br>
f.4803 = {'No, never': 76141,
 'Yes, but not now, but have in the past': 11400,
 'Yes, now some of the time': 9788,
 'Yes, now a lot of the time': 2973,
 'Yes, now most or all of the time': 7426,
 'Do not know': 1745,
 'Prefer not to answer': 127}

### 6.1.2. Inconsistencies in the tinnitus answers

In [None]:
# Start the tinnitus section from a copy of the checkpoint: a plain assignment
# would alias `saved_filtered`, and the columns added below would be written into it.
filtered = saved_filtered.copy()

In [None]:
tin_cols = [col for col in filtered if "f.4803" in col]

icd_10_cols = [col for col in df if "f.41270" in col]
icd_9_cols = [col for col in df if "f.41271" in col]

In [None]:
# Numeric code for every tinnitus answer: 0 = no, 1 = any kind of yes, 9 = do not know.
tin_ans = {
    "Do not know": 9,
    'Yes, but not now, but have in the past': 1,
    'Yes, now some of the time': 1,
    'Yes, now a lot of the time': 1,
    'Yes, now most or all of the time': 1,
    'No, never': 0,
}
# Every distinct answer pattern seen so far (filled in by find_options).
options = set()


def find_options(row):
    """Record the answer pattern of one individual in ``options``.

    Missing entries and "Prefer not to answer" are skipped; each remaining
    answer is mapped through ``tin_ans`` and the digits are concatenated in
    column order (e.g. "01"). Pass the columns of one phenotype at a time.
    """
    digits = []
    for reply in row:
        if pd.isna(reply) or reply == "Prefer not to answer":
            continue
        digits.append(str(tin_ans[reply]))
    options.add("".join(digits))

In [None]:
s = filtered[tin_cols].apply(find_options, axis=1)

In [None]:
options

In [None]:
# An answer pattern may be inconsistent when "do not know" (9) is mixed with other
# answers, or when yes (1) and no (0) both occur.
# `options` holds the set of all unique answer patterns collected by find_options.

do_not_know_no = [i for i in options if '0' in i and '9' in i and '1' not in i]
do_not_know_yes = [i for i in options if '0' not in i and '9' in i and '1' in i]
yes_no = [i for i in options if '0' in i and '9' not in i and '1' in i]
with_all_three = [i for i in options if '0' in i and '9' in i and '1' in i]

# Collect every pattern that could be flagged as inconsistent.
might_inconsistent = do_not_know_no + do_not_know_yes + yes_no + with_all_three

# Mixed patterns that are accepted anyway. In every pattern listed here no "0"
# follows a "1" and the last answer is never "9".
exceptions = ["91","911","9111","991","0001","001","0011","01","011","0111", "0091", "091","0911","0991","9001","901","9011", "90", "900", "9000", "990"]

# The patterns that are actually treated as inconsistent: the candidates minus the accepted exceptions.
inconsistent = [i for i in might_inconsistent if i not in exceptions]

In [None]:
inconsistent

#### 6.1.2.1. Filtering out the data

In [None]:
filtered

In [None]:
def find_inconsistencies(row):
    """Return True when the row should be removed, otherwise False.

    A row is removed when any entry is "I am completely deaf", or when its
    answer pattern (built from ``tin_ans``, skipping missing entries and
    "Prefer not to answer") is listed in ``inconsistent``.
    Pass the columns of one phenotype at a time.
    """
    if any(not pd.isna(reply) and reply == "I am completely deaf" for reply in row):
        return True

    ignored = ("I am completely deaf", "Prefer not to answer")
    pattern = "".join(
        str(tin_ans[reply])
        for reply in row
        if not pd.isna(reply) and reply not in ignored
    )
    return pattern in inconsistent

In [None]:
exclude = filtered[tin_cols].apply(find_inconsistencies, axis=1)
# Take an explicit copy of the selection: new columns are assigned to `filtered`
# later on, and assigning onto a boolean-mask slice triggers SettingWithCopyWarning.
filtered = filtered[~exclude].copy()

In [None]:
filtered

## 6.2. Identify Pure Control

Need to make sure that for tinnitus we obtain the individuals that say no.

The conditions for saying no are as follows:
* Never saying yes
* Not being inconsistent
* Saying no at least once (prefer not to say is allowed)

We also exclude individuals from the control group if they have certain ICD-9, ICD-10, or f.20002 codes (this applies even when they answer no to every tinnitus question). However, these individuals can still be included as cases.

In [None]:
# Numeric code for every tinnitus answer: 0 = no, 1 = any kind of yes, 9 = do not know.
tin_ans = {
    "Do not know": 9,
    'Yes, but not now, but have in the past': 1,
    'Yes, now some of the time': 1,
    'Yes, now a lot of the time': 1,
    'Yes, now most or all of the time': 1,
    'No, never': 0,
}


def find_ctrl(row):
    """Return False when the row is a tinnitus control, True otherwise.

    A control has at least one "No, never" answer (code 0) and no yes answer
    (code 1). Missing entries and "Prefer not to answer" are ignored.
    """
    answered = [reply for reply in row if not pd.isna(reply) and reply != "Prefer not to answer"]
    pattern = "".join(str(tin_ans[reply]) for reply in answered)
    said_no = "0" in pattern
    said_yes = "1" in pattern
    return not (said_no and not said_yes)

In [None]:
f4803_ctrl = filtered[tin_cols].apply(find_ctrl, axis=1)

In [None]:
sum(f4803_ctrl)

### 6.2.1. Collect ICD 10 codes to filter out from Ctrl

In [None]:
exclude_ctrl_icd10 = exclusion[(exclusion["UKBB_field_code"] == "f.41270") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd10

In [None]:
ex_critia_ctrl_icd10 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd10["Phenotype"].tolist()]
ex_critia_ctrl_icd10

In [None]:
icd10 = filtered[icd10_colnames]
icd10

In [None]:
ex_fxn_icd10 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd10)
ex_10 = icd10.apply(ex_fxn_icd10, axis=1)

### 6.2.2. Collect ICD 9 codes to filter out from Ctrl

In [None]:
exclude_ctrl_icd9 = exclusion[(exclusion["UKBB_field_code"] == "f.41271") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd9

In [None]:
ex_critia_ctrl_icd9 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd9["Phenotype"].tolist()]
ex_critia_ctrl_icd9

In [None]:
icd9 = filtered[icd9_colnames]
icd9

In [None]:
ex_fxn_icd9 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd9)
ex_9 = icd9.apply(ex_fxn_icd9, axis=1)

### 6.2.3. Collect f20002 codes to filter out from Ctrl

In [None]:
exclude_ctrl_f20002 = exclusion[(exclusion["UKBB_field_code"] == "f.20002") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_f20002

In [None]:
f20002 = filtered[f20002_colnames]
f20002

In [None]:
ex_critia_ctrl_f20002 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_f20002["Phenotype"].tolist()]
ex_critia_ctrl_f20002

In [None]:
ex_fxn_f20002 = lambda row: contains_exclusion(row, ex_critia_ctrl_f20002)
ex_f20002 = f20002.apply(ex_fxn_f20002, axis=1)

### 6.2.4. Collect individuals with other tinnitus codes to filter out from CTRL

In [None]:
def check_code(row, code):
    """Return 1 if any non-missing entry of ``row`` equals ``code``, otherwise 0."""
    return int(any(entry == code for entry in row if not pd.isna(entry)))

In [None]:
tinn_icd10_check_code = lambda row: check_code(row, "H931")
tinn_icd10 = filtered[icd_10_cols].apply(tinn_icd10_check_code, axis = 1)

In [None]:
tinn_icd9_check_code = lambda row: check_code(row, "3883")
tinn_icd9 = filtered[icd_9_cols].apply(tinn_icd9_check_code, axis = 1)

In [None]:
self_report_cols = [col for col in filtered if "f.20002" in col]
tinn_self_report_check_code = lambda row: check_code(row, "1597")
tinn_self_report = filtered[self_report_cols].apply(tinn_self_report_check_code, axis = 1)

### 6.2.5. Filter out Tinnitus Ctrl

In [None]:
sum(f4803_ctrl | ex_10 | ex_9 | ex_f20002 | tinn_icd10 | tinn_icd9 | tinn_self_report)

In [None]:
temp = f4803_ctrl | ex_10 | ex_9 | ex_f20002 | tinn_icd10 | tinn_icd9 | tinn_self_report
filtered_ctrl = [1 if i else 0 for i in temp.to_list()]

In [None]:
filtered["tinnitus_pure_ctrl"] = filtered_ctrl

In [None]:
filtered

## 6.3. Identify Age

In [None]:
ages_f21003_col = [col for col in filtered if "f.21003" in col]
ages_f21003_col

In [None]:
def get_min_age(row):
    """Return the minimum of the non-missing values in ``row``; ``pd.NA`` when there are none.

    Values are compared as stored, so string ages are compared as strings.
    """
    known = [value for value in row.to_list() if pd.notna(value)]
    if not known:
        return pd.NA
    return min(known)

In [None]:
filtered["tinnitus_age"] = filtered[ages_f21003_col].apply(get_min_age, axis=1)

In [None]:
# Checkpoint an independent copy: a plain assignment only aliases `filtered`,
# so columns added to either name afterwards would show up in both.
saved_tinn = filtered.copy()

In [None]:
# Restore from the checkpoint as a copy so that later changes to `filtered`
# do not modify `saved_tinn` itself.
filtered = saved_tinn.copy()

## 6.4. Noisy workplace and Loud Music Variable

Two variables that we need to control for in the analysis are f.4825 (noisy workplace) and f.4836 (loud music).

### 6.4.1. Check for inconsistencies

<b>f.4825 "Have you ever worked in a noisy place where you had to shout to be heard?"</b> <br>
"No"= "0", <br>
"Yes, for less than a year"= "1", <br>
"Yes, for around 1-5 years"= "2", <br>
"Yes, for more than 5 years"= "3", <br>
"Prefer not to answer"= NA, <br>
"Do not know"= NA <br>

<b>f.4836 "Have you ever listened to music for more than 3 hours per week at a volume which you would need to shout to be heard or, if wearing headphones, someone else would need to shout for you to hear them?" </b> <br>
"No"= "0", <br>
"Yes, for less than a year"= "1", <br>
"Yes, for around 1-5 years"= "2", <br>
"Yes, for more than 5 years"= "3", <br>
"Prefer not to answer"= NA, <br>
"Do not know"= NA <br>

In [None]:
noise_wp_cols = [col for col in df if "f.4825" in col]
loud_music_cols = [col for col in df if "f.4836" in col]

In [None]:
# Ordinal code for the exposure answers; any other answer is ignored.
noise_loud_answers = {
    "No": 0,
    "Yes, for less than a year": 1,
    "Yes, for around 1-5 years": 2,
    "Yes, for more than 5 years": 3,
}


def find_inconsistencies_noisy_loud(row):
    """Return True when the coded answers of ``row`` are not in non-decreasing order.

    Only answers listed in ``noise_loud_answers`` are considered. A later answer
    with a lower code than an earlier one marks the individual as inconsistent.
    """
    levels = []
    for reply in row:
        if reply in noise_loud_answers:
            levels.append(noise_loud_answers[reply])
    return any(earlier > later for earlier, later in zip(levels, levels[1:]))
        

In [None]:
exclude = filtered[noise_wp_cols].apply(find_inconsistencies_noisy_loud, axis=1)
# Take an explicit copy of the selection: new columns are assigned to `filtered`
# later on, and assigning onto a boolean-mask slice triggers SettingWithCopyWarning.
filtered = filtered[~exclude].copy()

In [None]:
exclude = filtered[loud_music_cols].apply(find_inconsistencies_noisy_loud, axis=1)
# Take an explicit copy of the selection: new columns are assigned to `filtered`
# later on, and assigning onto a boolean-mask slice triggers SettingWithCopyWarning.
filtered = filtered[~exclude].copy()

### 6.4.2. Label Noise and Loud Music

In [None]:
def find_label_noisy_loud(row):
    """Return the code of the last answer in ``row`` that is listed in ``noise_loud_answers``.

    Answers that are not keys of ``noise_loud_answers`` are ignored; ``pd.NA``
    is returned when no listed answer is present.
    """
    latest = pd.NA
    for reply in row:
        if reply in noise_loud_answers.keys():
            latest = noise_loud_answers[reply]
    return latest

In [None]:
filtered["noise_wp"] = filtered[noise_wp_cols].apply(find_label_noisy_loud, axis=1)

In [None]:
filtered["noise_wp"] = filtered["noise_wp"].fillna( int(filtered["noise_wp"].median(skipna=True)) )

In [None]:
filtered["loud_music"] = filtered[loud_music_cols].apply(find_label_noisy_loud, axis=1)

In [None]:
filtered["loud_music"] = filtered["loud_music"].fillna( int(filtered["loud_music"].median(skipna=True)) )

In [None]:
filtered

## 6.5. Identify Cases

**Analysis plan:**

1. Individuals who report tinnitus (all four "Yes" categories) vs. "No, never", controlling only for sex, age, noisy workplace and loud music frequency ("crude": no tinnitus vs. any type of tinnitus). For this analysis the missing data of the noise variables was imputed using the median for cases and controls separately.

2. Individuals in the top two "Yes" categories vs. "No, never" (no tinnitus vs. 'Yes, now a lot of the time' and 'Yes, now most or all of the time').

3. Individuals who say yes in the top three categories vs. "No, never" (the category 'Yes, but not now, but have in the past' is removed).

4. Individuals who report tinnitus (all four "Yes" categories and tinnitus codes), without filtering for issues with noisy workplace and loud music.

In [None]:
def find_yes(row):
    """Return 1 if any answer in ``row`` is coded 1 in the current ``tin_ans``, otherwise 0.

    Missing entries, "Prefer not to answer" and answers that are not keys of
    ``tin_ans`` never count. The result depends on which ``tin_ans`` mapping is
    active when the function is called.

    NOTE(review): this redefinition replaces the earlier one-argument
    ``find_yes`` that looks for the literal answer "Yes"; re-running the hearing
    cells after this cell would use this version instead.
    """
    is_case = any(
        not pd.isna(reply)
        and reply != "Prefer not to answer"
        and tin_ans.get(reply) == 1
        for reply in row
    )
    return 1 if is_case else 0

### 6.5.1. Analysis Plan 1

In [None]:
tin_ans = {"Do not know":9, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [None]:
tinn_yes = filtered[tin_cols].apply(find_yes, axis=1)

In [None]:
filtered["tinnitus_1"] = tinn_yes

### 6.5.2. Analysis Plan 2

In [None]:
tin_ans = {"Do not know":9, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [None]:
tinn_yes = filtered[tin_cols].apply(find_yes, axis=1)

In [None]:
filtered["tinnitus_2"] = tinn_yes

### 6.5.3. Analysis Plan 3

In [None]:
tin_ans = {"Do not know":9, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [None]:
tinn_yes = filtered[tin_cols].apply(find_yes, axis=1)

In [None]:
sum(tinn_yes)

In [None]:
filtered["tinnitus_3"] = tinn_yes

### 6.5.4. Analysis Plan 4

In [None]:
# Work on a copy of the checkpoint: a plain assignment would alias `saved_tinn`,
# and the "tinnitus_4" column added below would be written into the checkpoint.
filtered_for4 = saved_tinn.copy()

In [None]:
tin_ans = {"Do not know":9, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [None]:
tinn_yes = filtered_for4[tin_cols].apply(find_yes, axis=1)

In [None]:
sum(tinn_yes | tinn_icd10 | tinn_icd9 | tinn_self_report)

In [None]:
filtered_for4["tinnitus_4"] = tinn_yes | tinn_icd10 | tinn_icd9 | tinn_self_report

In [None]:
sum(filtered_for4["tinnitus_4"])

## 6.6. File Output

In [None]:
filtered

In [None]:
filtered_for4

In [None]:
filtered[filtered["tinnitus_pure_ctrl"] == 0][["FID", "IID", "sex", "tinnitus_pure_ctrl", "tinnitus_age", "noise_wp", "loud_music"]].to_csv("tinnitus_pure_ctrl_pheno_file.tsv", sep='\t', index=False)
#filtered[filtered["tinnitus_pure_ctrl"] == 0][["FID", "IID", "sex", "tinnitus_pure_ctrl", "ctrl_age"]].to_csv("tinnitus_pure_ctrl_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["tinnitus_1"] == 1][["FID", "IID", "sex", "tinnitus_1", "tinnitus_age", "noise_wp", "loud_music"]].to_csv("tinnitus_1_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["tinnitus_2"] == 1][["FID", "IID", "sex", "tinnitus_2", "tinnitus_age", "noise_wp", "loud_music"]].to_csv("tinnitus_2_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["tinnitus_3"] == 1][["FID", "IID", "sex", "tinnitus_3", "tinnitus_age", "noise_wp", "loud_music"]].to_csv("tinnitus_3_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered_for4[filtered_for4["tinnitus_4"] == 1][["FID", "IID", "sex", "tinnitus_4", "tinnitus_age"]].to_csv("tinnitus_4_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered_for4[filtered_for4["tinnitus_4"] == 1][["FID", "IID", "sex", "tinnitus_4", "tinnitus_age"]]

# 7. Merge Pheno with Ctrl

## 7.1. f2247, f2257, f3393, and Mendelian

In [None]:
ctrl_file_name = "pure_ctrl_pheno_file.tsv"
f3393_file_name = "f3393_pheno_file.tsv"
f2247_file_name = "f2247_pheno_file.tsv"
f2257_file_name = "f2257_pheno_file.tsv"
f2247_f2257_file_name = "f2247_f2257_pheno_file.tsv"
mendilianlike_file_name = "mendelian_pheno_file.tsv"

In [None]:
f3393 = pd.read_csv(f3393_file_name, sep="\t")
f2247 = pd.read_csv(f2247_file_name, sep="\t")
f2257 = pd.read_csv(f2257_file_name, sep="\t")
f2247_f2257 = pd.read_csv(f2247_f2257_file_name, sep="\t")
ctrl = pd.read_csv(ctrl_file_name, sep="\t")
mendlike = pd.read_csv(mendilianlike_file_name, sep="\t")

In [None]:
print("ctrl: ",len(ctrl))
print("f2247: ",len(f2247))
print("f2257: ",len(f2257))
print("f2247_f2257: ",len(f2247_f2257))
print("f3393: ",len(f3393))
print("mendlike: ",len(mendlike))

### 7.1.1. f2247

In [None]:
# Align the control frame with the case frame: column 4 becomes "age" and
# column 3 takes the name of the case frame's phenotype column.
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f2247.columns[3]})
f2247 = f2247.rename(columns={f2247.columns[4]:"age"})

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
full_pheno = pd.concat([f2247, ctrl])
# NOTE(review): the case/control counts in the file name are hard-coded - confirm they match len(f2247) and len(ctrl).
full_pheno.to_csv("080421_UKBB_Hearing_difficulty_f2247_expandedwhite_46237cases_98082ctrl", sep='\t', index=False)

### 7.1.2. f2257

In [None]:
# Align the control frame with the case frame: column 4 becomes "age" and
# column 3 takes the name of the case frame's phenotype column.
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f2257.columns[3]})
f2257 = f2257.rename(columns={f2257.columns[4]:"age"})

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
full_pheno = pd.concat([f2257, ctrl])
# NOTE(review): the case/control counts in the file name are hard-coded - confirm they match len(f2257) and len(ctrl).
full_pheno.to_csv("080421_UKBB_Hearing_noise_f2257_expandedwhite_66656cases_98082ctrl", sep='\t', index=False)

### 7.1.3. f2247_f2257

In [None]:
# Align the control frame with the case frame: column 4 becomes "age" and
# column 3 takes the name of the case frame's phenotype column.
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f2247_f2257.columns[3]})
f2247_f2257 = f2247_f2257.rename(columns={f2247_f2257.columns[4]:"age"})

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
full_pheno = pd.concat([f2247_f2257, ctrl])
# NOTE(review): the case/control counts in the file name are hard-coded - confirm they match len(f2247_f2257) and len(ctrl).
full_pheno.to_csv("080421_UKBB_Combined_f2247_f2257_expandedwhite_39049cases_98082ctrl", sep='\t', index=False)

### 7.1.4. f3393

In [None]:
# Align the control frame with the case frame: column 4 becomes "age" and
# column 3 takes the name of the case frame's phenotype column.
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f3393.columns[3]})
f3393 = f3393.rename(columns={f3393.columns[4]:"age"})

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
full_pheno = pd.concat([f3393, ctrl])
# NOTE(review): the case/control counts in the file name are hard-coded - confirm they match len(f3393) and len(ctrl).
full_pheno.to_csv("080421_UKBB_Hearing_aid_f3393_expandedwhite_6305cases_98082ctrl", sep='\t', index=False)

### 7.1.5. Mendelian

In [None]:
# Align the control frame with the case frame: column 4 becomes "age" and
# column 3 takes the name of the case frame's phenotype column.
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:mendlike.columns[3]})
mendlike = mendlike.rename(columns={mendlike.columns[4]:"age"})

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
full_pheno = pd.concat([mendlike, ctrl])
# NOTE(review): the case/control counts in the file name are hard-coded - confirm they match len(mendlike) and len(ctrl).
full_pheno.to_csv("080421_UKBB_Mendelian_expandedwhite_1520cases_98082ctrl", sep='\t', index=False)

## 7.2. Tinnitus

In [None]:
tinnitus_ctrl_file_name = "tinnitus_pure_ctrl_pheno_file.tsv"
tinnitus_1_file_name = "tinnitus_1_pheno_file.tsv"
tinnitus_2_file_name = "tinnitus_2_pheno_file.tsv"
tinnitus_3_file_name = "tinnitus_3_pheno_file.tsv"
tinnitus_4_file_name = "tinnitus_4_pheno_file.tsv"

In [None]:
tinnitus_ctrl = pd.read_csv(tinnitus_ctrl_file_name, sep="\t")
tinnitus_1 = pd.read_csv(tinnitus_1_file_name, sep="\t")
tinnitus_2 = pd.read_csv(tinnitus_2_file_name, sep="\t")
tinnitus_3 = pd.read_csv(tinnitus_3_file_name, sep="\t")
tinnitus_4 = pd.read_csv(tinnitus_4_file_name, sep="\t")

In [None]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[4]:"age"})
tinnitus_1 = tinnitus_1.rename(columns={tinnitus_1.columns[4]:"age"})
tinnitus_2 = tinnitus_2.rename(columns={tinnitus_2.columns[4]:"age"})
tinnitus_3 = tinnitus_3.rename(columns={tinnitus_3.columns[4]:"age"})
tinnitus_4 = tinnitus_4.rename(columns={tinnitus_4.columns[4]:"age"})

In [None]:
print("ctrl: ",len(tinnitus_ctrl))
print("tinnitus 1: ",len(tinnitus_1))
print("tinnitus 2: ",len(tinnitus_2))
print("tinnitus 3: ",len(tinnitus_3))
print("tinnitus 4: ",len(tinnitus_4))

### 7.2.1. Analysis 1

In [None]:
# Give the control status column (position 3) the same name as the case frame's phenotype column.
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[3]:tinnitus_1.columns[3]})
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
full_tinnitus_1 = pd.concat([tinnitus_1, tinnitus_ctrl])
full_tinnitus_1

### 7.2.2. Analysis 2

In [None]:
# Give the control status column (position 3) the same name as the case frame's phenotype column.
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[3]:tinnitus_2.columns[3]})
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
full_tinnitus_2 = pd.concat([tinnitus_2, tinnitus_ctrl])
full_tinnitus_2

### 7.2.3. Analysis 3

In [None]:
# Give the control status column (position 3) the same name as the case frame's phenotype column.
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[3]:tinnitus_3.columns[3]})
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
full_tinnitus_3 = pd.concat([tinnitus_3, tinnitus_ctrl])
full_tinnitus_3

### 7.2.4. Analysis 4

In [None]:
# Give the control status column (position 3) the same name as the case frame's phenotype column.
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[3]:tinnitus_4.columns[3]})
# The last two control columns are dropped before stacking; the tinnitus_4 case file
# is written without the "noise_wp" and "loud_music" columns.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
full_tinnitus_4 = pd.concat([tinnitus_4, tinnitus_ctrl[tinnitus_ctrl.columns[:-2]]])
full_tinnitus_4