# Paper revision and response to the reviewers AJHG

Investigate the number of individuals with code 1415 ear/vestibular disorder in the 200K database

# Generate the phenotypes for the hearing impairment traits from the UKBB

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

# 1. Read in the data

## 1.1. Read in database

In [2]:
# collect the necessary column names of the database for our analysis

with open("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/082321_UKBB_exomes.csv") as fp:
    line = fp.readline() # header
    header = line.split(",")
    
    indiv = ["IID", "FID"]
    icd10_colnames = [col.strip('"') for col in header if "f.41270." in col]
    icd10_ages = [col.strip('"') for col in header if "f.41280." in col]
    icd9_colnames = [col.strip('"') for col in header if "f.41271." in col]
    icd9_ages = [col.strip('"') for col in header if "f.41281." in col]
    f20002_colnames = [col.strip('"') for col in header if "f.20002." in col]
    reported_sex = ["f.31.0.0"]
    genetic_sex = ["f.22001.0.0"]
    ethnicity = [col.strip('"') for col in header if "f.21000." in col]
    hearing_imp_f3393 = [col.strip('"') for col in header if "f.3393." in col]
    hearing_imp_f2247 = [col.strip('"') for col in header if "f.2247." in col]
    hearing_imp_f2257 = [col.strip('"') for col in header if "f.2257." in col]
    tin_cols = [col.strip('"') for col in header if "f.4803." in col]
    ages_f21003_col = [col.strip('"') for col in header if "f.21003." in col]
    ages_f131258_col = [col.strip('"') for col in header if 'f.131258.' in col]
    year_of_birth = [col.strip('"') for col in header if "f.34." in col]
    month_of_birth = [col.strip('"') for col in header if "f.52." in col]

In [3]:
# combine the column names into one list
combined_cols = indiv + icd10_colnames + icd10_ages + icd9_colnames + icd9_ages + f20002_colnames + ethnicity + reported_sex + genetic_sex + hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257 + tin_cols + ages_f21003_col + ages_f131258_col + year_of_birth + month_of_birth

In [4]:
print(datetime.now())

2022-06-20 14:21:27.416923


In [5]:
# database of all individuals that we are working with and the selected phenotypes
df = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/082321_UKBB_exomes.csv", quotechar = '"', dtype="string", usecols=combined_cols)
df

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,f.2257.1.0,f.2257.2.0,f.2257.3.0,f.3393.0.0,f.3393.1.0,f.3393.2.0,f.3393.3.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,...,f.41281.0.8,f.41281.0.9,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,,,,No,,,,,,,,1111,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1000035,1000035,Male,1944,May,No,,,,Yes,,,,No,,,,,,,,1396,1473,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1000078,1000078,Female,1955,June,No,No,No,,No,No,No,,,No,No,,,"No, never","No, never",,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1000081,1000081,Male,1942,February,No,,,,No,,,,No,,,,"No, never",,,,1075,1440,1473,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1000198,1000198,Female,1967,July,Yes,,,,Yes,,,,No,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1969-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,6025295,6025295,Male,1961,April,No,,,,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
200615,6025319,6025319,Female,1953,March,No,,,,No,,,,No,,,,"No, never",,,,1065,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
200616,6025346,6025346,Female,1954,October,No,,,,No,,,,,,,,,,,,1452,1265,1387,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
200617,6025363,6025363,Male,1944,April,No,,,,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [50]:
def find_20002(row):
    if "1415" in row.dropna().to_list():
        return True
    return False
lis = df[f20002_colnames].apply(find_20002, axis=1)
lis.sum()

1720

In [52]:
list1415 = df[lis==True]["IID"].to_list()

In [59]:
list1415

['1005452',
 '1009003',
 '1009950',
 '1010797',
 '1016310',
 '1017781',
 '1021442',
 '1022648',
 '1024441',
 '1025320',
 '1032097',
 '1037805',
 '1038985',
 '1040074',
 '1046409',
 '1048444',
 '1049663',
 '1056155',
 '1061443',
 '1065125',
 '1066100',
 '1066367',
 '1066469',
 '1066762',
 '1067761',
 '1071556',
 '1072379',
 '1073039',
 '1078908',
 '1079436',
 '1081239',
 '1081606',
 '1088351',
 '1089685',
 '1092165',
 '1092504',
 '1093633',
 '1095447',
 '1095941',
 '1097955',
 '1098878',
 '1101864',
 '1103372',
 '1110004',
 '1115102',
 '1117452',
 '1123834',
 '1127496',
 '1130828',
 '1137001',
 '1137013',
 '1137624',
 '1139634',
 '1140005',
 '1140122',
 '1142165',
 '1156982',
 '1160229',
 '1163307',
 '1163548',
 '1164003',
 '1172082',
 '1175322',
 '1175615',
 '1176651',
 '1181543',
 '1183645',
 '1190984',
 '1193246',
 '1196363',
 '1196900',
 '1197186',
 '1198542',
 '1198980',
 '1199130',
 '1200363',
 '1201714',
 '1210604',
 '1211422',
 '1213888',
 '1221105',
 '1221861',
 '1233625',
 '12

In [44]:
idlist = pd.DataFrame(columns=["FID","IID"])
file = "/home/dmc2245/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl"
phe = pd.read_csv(file,header=0,sep="\t")
idlist = pd.concat([idlist,phe[["FID","IID"]]],ignore_index=True)
file = "/home/dmc2245/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_difficulty_f2247_expandedwhite_45502cases_96601ctrl"
phe = pd.read_csv(file,header=0,sep="\t")
idlist = pd.concat([idlist,phe[["FID","IID"]]],ignore_index=True)
file = "/home/dmc2245/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_noise_f2257_expandedwhite_65660cases_96601ctrl"
phe = pd.read_csv(file,header=0,sep="\t")
idlist = pd.concat([idlist,phe[["FID","IID"]]],ignore_index=True)
file = "/home/dmc2245/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Combined_f2247_f2257_expandedwhite_38410cases_96601ctrl"
phe = pd.read_csv(file,header=0,sep="\t")
idlist = pd.concat([idlist,phe[["FID","IID"]]],ignore_index=True)
idlist = idlist.drop_duplicates("FID")
idlist = idlist.astype("string")
idlist

Unnamed: 0,FID,IID
0,1001384,1001384
1,1002548,1002548
2,1002888,1002888
3,1002944,1002944
4,1003258,1003258
...,...,...
310792,6024469,6024469
310793,6024535,6024535
310795,6024578,6024578
310796,6024829,6024829


In [61]:
filtered = df[df["IID"].isin(idlist["IID"].to_list())]
filtered[filtered["IID"].isin(list1415)]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,f.2257.1.0,f.2257.2.0,f.2257.3.0,f.3393.0.0,f.3393.1.0,f.3393.2.0,f.3393.3.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,...,f.41281.0.8,f.41281.0.9,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0
227,1005452,1005452,Female,1942,August,Yes,,,,Yes,,,,No,,,,"Yes, now most or all of the time",,,,1537,1415,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
365,1009003,1009003,Female,1950,July,No,,,,No,,,,No,,,,"Yes, but not now, but have in the past",,,,1415,1065,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
440,1010797,1010797,Male,1945,December,Yes,Yes,,,Yes,Yes,,,Yes,Yes,,,,"Yes, now most or all of the time",,,1466,1415,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
679,1016310,1016310,Female,1944,December,No,,,,Yes,,,,No,,,,,,,,1065,99999,99999,1406,1226,1415,1077,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
746,1017781,1017781,Male,1946,February,Yes,Yes,Yes,,Yes,Yes,Yes,,No,No,No,,,"Yes, now some of the time","No, never",,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199840,6005906,6005906,Female,1956,February,Yes,,,,Yes,,,,No,,,,,,,,1094,1415,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
200093,6012471,6012471,Male,1961,July,No,,,,No,,,,,,,,,,,,1065,1415,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
200389,6020151,6020151,Male,1957,November,Yes,Yes,Yes,,Yes,Yes,Yes,,No,No,No,,,"Yes, now most or all of the time","Yes, now most or all of the time",,1415,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1997-08-21
200517,6023081,6023081,Female,1941,November,Yes,,,,Yes,,,,No,,,,,,,,1065,1471,1078,1352,1415,1473,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [45]:
filtered[filtered["IID"].isin(list1415)].to_csv("cases_controls_with1415code.csv", sep=',', index=False)

In [46]:
file = "/home/dmc2245/UKBiobank/phenotype_files/hearing_impairment/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl"
f3393 = pd.read_csv(file,header=0,sep="\t")

In [48]:
f3393

Unnamed: 0,FID,IID,sex,f3393,age
0,1001384,1001384,1,1,61
1,1002548,1002548,0,1,62
2,1002888,1002888,0,1,68
3,1002944,1002944,0,1,65
4,1003258,1003258,0,1,74
...,...,...,...,...,...
103032,6025251,6025251,0,0,56
103033,6025319,6025319,1,0,56
103034,6025346,6025346,1,0,53
103035,6025363,6025363,0,0,64


In [65]:
f3393[f3393["IID"].isin(list1415)]

Unnamed: 0,FID,IID,sex,f3393,age


In [21]:
def find_20002(row):
    if "1415" in row.dropna().to_list():
        return True
    return False
lis = filtered[f20002_colnames].apply(find_20002, axis=1)
lis.sum()

1316

In [23]:
qc_individuals = pd.read_csv("/mnt/mfs/statgen/UKBiobank/results/083021_PCA_results/europeans/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.europeans.filtered.fam", sep="\t", header=None)
outlier = pd.read_csv("/mnt/mfs/statgen/UKBiobank/results/083021_PCA_results/090321_PCA_related_pval0.005/030821_ukb42495_exomed_white_189010ind.090321_PCA_related_pval0.005.pca.projected.outliers", sep="\t", header=None)
reported_sex = ["f.31.0.0"]
genetic_sex = ["f.22001.0.0"]
def inconsistent_sexes(row):
    return pd.isna(row[genetic_sex[0]]) or row[reported_sex[0]] != row[genetic_sex[0]]
ex_sex = df[reported_sex + genetic_sex].apply(inconsistent_sexes, axis=1)
filtered = df[~ex_sex]

In [24]:
print(sum(ex_sex), "individuals removed because of inconsistency with the genetic and reported sex variables")

233 individuals removed because of inconsistency with the genetic and reported sex variables


In [25]:
set(filtered[ethnicity[0]].to_list()).union( set(filtered[ethnicity[1]].to_list()) , set(filtered[ethnicity[2]].to_list()))

{<NA>,
 'African',
 'Any other Asian background',
 'Any other Black background',
 'Any other mixed background',
 'Any other white background',
 'Asian or Asian British',
 'Bangladeshi',
 'Black or Black British',
 'British',
 'Caribbean',
 'Chinese',
 'Do not know',
 'Indian',
 'Irish',
 'Mixed',
 'Other ethnic group',
 'Pakistani',
 'Prefer not to answer',
 'White',
 'White and Asian',
 'White and Black African',
 'White and Black Caribbean'}

In [26]:
# these should align with all possible options for ethnicity answers except for <NA>, Do not know, and Prefer not to answer
white = ['British', 'Irish', 'White','Any other white background']
african = ['Caribbean','White and Black Caribbean', 'African', 'White and Black African', 'Black or Black British', 'Any other Black background' ]
asian = ['Indian', 'Pakistani', 'White and Asian', 'Any other Asian background', 'Bangladeshi', 'Asian or Asian British']
mixed = ['Mixed', 'Any other mixed background']
chinese = ['Chinese']
other = ['Other ethnic group']

# figure out the ancestry of each individual
def ancestry(row):
    temp = [x for x in row[ethnicity] if not pd.isna(x) and x != "Prefer not to answer" and x != "Do not know"]
    if len(temp) == 0:
        return "Unknown"
    
    if len(set(temp)) == 1 and temp[0] in white: # if we have only one unique answer and the answer is in the white variable
        return "_".join(temp[0].split(" ")) # return the unique answer
    
    if len([x for x in temp if x in white]) == len(temp):
        return "Inconsistent_white"
    if len([x for x in temp if x in african]) == len(temp):
        return "African"
    if len([x for x in temp if x in asian]) == len(temp):
        return "Asian"
    if len([x for x in temp if x in mixed]) == len(temp):
        return "Mixed"
    if len([x for x in temp if x in chinese]) == len(temp):
        return "Chinese"
    if len([x for x in temp if x in other]) == len(temp):
        return "Other"
    return "Inconsistent"
filtered["ethnicity"] = filtered[ethnicity].apply(ancestry, axis=1)
def find_non_white(row):
    return row["ethnicity"] not in white and row["ethnicity"] != "Unknown" and row["ethnicity"] != "Inconsistent_white" and row["ethnicity"] != "Any_other_white_background"
ex_non_white = filtered[["ethnicity"]].apply(find_non_white, axis=1)
filtered = filtered[~ex_non_white]
print(sum(ex_non_white), "individuals removed for being non-white in the exome data")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["ethnicity"] = filtered[ethnicity].apply(ancestry, axis=1)


11385 individuals removed for being non-white in the exome data


In [27]:
filtered.shape

(189001, 713)

In [28]:
qc_list = [str(i) for i in qc_individuals[0].to_list()]
def matches_qc_individuals(row):
    return row["FID"] in qc_list
filtered = filtered[filtered[["FID"]].apply(matches_qc_individuals, axis=1)]

In [29]:
out_ids = [str(x) for x in outlier[0].to_list()] 

def find_outliers(row):
    return row["IID"] in out_ids
ex_pca_outliers = filtered[["IID", "FID"]].apply(find_outliers, axis=1)
filtered = filtered[~ex_pca_outliers]
print(sum(ex_pca_outliers), "individuals removed for being pca outliers")

566 individuals removed for being pca outliers


In [30]:
filtered.shape

(187908, 713)

## Find out how many people in our sample has the code 1415

How many of those are cases?

How many of those are controls?