# Generate the phenotypes for the hearing impairment traits from the UKBB for the 500K individuals with imputed data

In [6]:
import pandas as pd
import numpy as np
from datetime import datetime

## Read in database

In [2]:
# collect the necessary column names of the database for our analysis
#This database is already subsetted for white Europeans that passed QC by using Elnaz file 
##/mnt/vast/hpc/csg/UKBiobank/data/ukbb_databases/ukb_673643_JUL2023/IDs_WhiteEU_passed_QC_459241.txt
with open("/mnt/vast/hpc/csg/UKBiobank/data/ukbb_databases/ukb_673643_JUL2023/071923_ukb673643_459241ind_call90.csv") as fp:
    line = fp.readline() # header
    header = line.split(",")
    
    indiv = ["IID", "FID"]
    icd10_colnames = [col.strip('"') for col in header if "f.41270." in col]
    icd10_ages = [col.strip('"') for col in header if "f.41280." in col]
    icd9_colnames = [col.strip('"') for col in header if "f.41271." in col]
    icd9_ages = [col.strip('"') for col in header if "f.41281." in col]
    f20002_colnames = [col.strip('"') for col in header if "f.20002." in col]
    f20002_ages = [col.strip('"') for col in header if "f.20009." in col]
    reported_sex = ["f.31.0.0"]
    genetic_sex = ["f.22001.0.0"]
    ethnicity = [col.strip('"') for col in header if "f.21000." in col]
    hearing_imp_f3393 = [col.strip('"') for col in header if "f.3393." in col]
    hearing_imp_f2247 = [col.strip('"') for col in header if "f.2247." in col]
    hearing_imp_f2257 = [col.strip('"') for col in header if "f.2257." in col]
    tin_cols = [col.strip('"') for col in header if "f.4803." in col]
    ages_f21003_col = [col.strip('"') for col in header if "f.21003." in col]
    ages_f131258_col = [col.strip('"') for col in header if 'f.131258.' in col]
    year_of_birth = [col.strip('"') for col in header if "f.34." in col]
    month_of_birth = [col.strip('"') for col in header if "f.52." in col]

In [7]:
# combine the column names into one list
combined_cols = indiv + icd10_colnames + icd10_ages + icd9_colnames + icd9_ages + f20002_colnames + f20002_ages + ethnicity + reported_sex + genetic_sex + hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257 + tin_cols + ages_f21003_col + ages_f131258_col + year_of_birth + month_of_birth

In [8]:
print(datetime.now())

2023-07-24 14:32:00.276623


In [9]:
import os
os. getcwd()

'/mnt/mfs/hgrcgrid/homes/dmc2245/project/UKBB_GWAS_dev/analysis/phenotypes'

In [10]:
# database of all individuals that we are working with and the selected phenotypes july2023 update
df = pd.read_csv("/mnt/vast/hpc/csg/UKBiobank/data/ukbb_databases/ukb_673643_JUL2023/071923_ukb673643_459241ind_call90.csv", quotechar = '"', dtype="string", usecols=combined_cols)
df

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459151,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,
459152,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,
459153,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,
459154,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,


In [11]:
print(datetime.now())

2023-07-24 14:41:15.563527


In [12]:
### saving the working file
df.to_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/071923_ukb673643_459241ind_call90_subsetvars.csv",index=False)

### Reading working file (subset of original pheno file)

In [13]:
list(df.columns)

['IID',
 'FID',
 'f.31.0.0',
 'f.34.0.0',
 'f.52.0.0',
 'f.2247.0.0',
 'f.2247.1.0',
 'f.2247.2.0',
 'f.2247.3.0',
 'f.2257.0.0',
 'f.2257.1.0',
 'f.2257.2.0',
 'f.2257.3.0',
 'f.3393.0.0',
 'f.3393.1.0',
 'f.3393.2.0',
 'f.3393.3.0',
 'f.4803.0.0',
 'f.4803.1.0',
 'f.4803.2.0',
 'f.4803.3.0',
 'f.20002.0.0',
 'f.20002.0.1',
 'f.20002.0.2',
 'f.20002.0.3',
 'f.20002.0.4',
 'f.20002.0.5',
 'f.20002.0.6',
 'f.20002.0.7',
 'f.20002.0.8',
 'f.20002.0.9',
 'f.20002.0.10',
 'f.20002.0.11',
 'f.20002.0.12',
 'f.20002.0.13',
 'f.20002.0.14',
 'f.20002.0.15',
 'f.20002.0.16',
 'f.20002.0.17',
 'f.20002.0.18',
 'f.20002.0.19',
 'f.20002.0.20',
 'f.20002.0.21',
 'f.20002.0.22',
 'f.20002.0.23',
 'f.20002.0.24',
 'f.20002.0.25',
 'f.20002.0.26',
 'f.20002.0.27',
 'f.20002.0.28',
 'f.20002.0.29',
 'f.20002.0.30',
 'f.20002.0.31',
 'f.20002.0.32',
 'f.20002.0.33',
 'f.20002.1.0',
 'f.20002.1.1',
 'f.20002.1.2',
 'f.20002.1.3',
 'f.20002.1.4',
 'f.20002.1.5',
 'f.20002.1.6',
 'f.20002.1.7',
 'f.20002

In [14]:
df["f.20002.0.0"].value_counts(dropna=False)

NaN     115043
1065    103596
1111     34508
1465     10777
1075     10269
         ...  
1236         1
1496         1
1521         1
1621         1
1678         1
Name: f.20002.0.0, Length: 441, dtype: Int64

In [15]:
#df["f.20009.0.0"]

In [16]:
non_cancer = df [["IID","f.20002.0.0","f.20009.0.0","f.20002.1.0", "f.20009.1.0"]]
non_cancer

Unnamed: 0,IID,f.20002.0.0,f.20009.0.0,f.20002.1.0,f.20009.1.0
0,1000019,1111,47.1,,
1,1000022,1065,43.5,,
2,1000035,1396,62.2,,
3,1000046,1065,61.5,,
4,1000054,,,,
...,...,...,...,...,...
459151,6025390,1464,65.3,,
459152,6025409,1478,39.6,1627,23.6
459153,6025411,,,,
459154,6025425,1265,22.5,,


In [17]:
ICD10 = df [["IID","f.41270.0.0", "f.41280.0.0","f.41270.0.1", "f.41280.0.1"]]
ICD10

Unnamed: 0,IID,f.41270.0.0,f.41280.0.0,f.41270.0.1,f.41280.0.1
0,1000019,E041,2013-11-21,F329,2020-03-02
1,1000022,F101,2015-07-31,J342,2002-03-14
2,1000035,H269,2014-05-10,K579,2007-02-14
3,1000046,E780,2009-10-30,G473,2014-05-28
4,1000054,C679,2014-01-07,C787,2014-04-08
...,...,...,...,...,...
459151,6025390,A049,2021-01-13,A099,2020-12-01
459152,6025409,,,,
459153,6025411,O149,1997-05-13,O266,1997-05-13
459154,6025425,G551,2009-02-25,G558,2018-08-08


In [18]:
icd9_colnames

['f.41271.0.0',
 'f.41271.0.1',
 'f.41271.0.2',
 'f.41271.0.3',
 'f.41271.0.4',
 'f.41271.0.5',
 'f.41271.0.6',
 'f.41271.0.7',
 'f.41271.0.8',
 'f.41271.0.9',
 'f.41271.0.10',
 'f.41271.0.11',
 'f.41271.0.12',
 'f.41271.0.13',
 'f.41271.0.14',
 'f.41271.0.15',
 'f.41271.0.16',
 'f.41271.0.17',
 'f.41271.0.18',
 'f.41271.0.19',
 'f.41271.0.20',
 'f.41271.0.21',
 'f.41271.0.22',
 'f.41271.0.23',
 'f.41271.0.24',
 'f.41271.0.25',
 'f.41271.0.26',
 'f.41271.0.27',
 'f.41271.0.28',
 'f.41271.0.29',
 'f.41271.0.30',
 'f.41271.0.31',
 'f.41271.0.32',
 'f.41271.0.33',
 'f.41271.0.34',
 'f.41271.0.35',
 'f.41271.0.36',
 'f.41271.0.37',
 'f.41271.0.38',
 'f.41271.0.39',
 'f.41271.0.40',
 'f.41271.0.41',
 'f.41271.0.42',
 'f.41271.0.43',
 'f.41271.0.44',
 'f.41271.0.45',
 'f.41271.0.46']

In [19]:
ICD9 = df [["IID","f.41271.0.0", "f.41281.0.0","f.41271.0.1", "f.41281.0.1"]]
ICD9

Unnamed: 0,IID,f.41271.0.0,f.41281.0.0,f.41271.0.1,f.41281.0.1
0,1000019,,,,
1,1000022,,,,
2,1000035,,,,
3,1000046,,,,
4,1000054,3000,1981-01-14,5198,1981-01-14
...,...,...,...,...,...
459151,6025390,,,,
459152,6025409,,,,
459153,6025411,,,,
459154,6025425,,,,


## Estimating age for each ICD9/10

In [20]:
###changing brithday to date, need to add day for each individuals
for index, row in df.iterrows():
     df.at[index, 'day'] = 1

  self.obj[key] = infer_fill_value(value)


In [21]:
BD = df [["IID","f.34.0.0", "f.52.0.0", "day"]]
BD

Unnamed: 0,IID,f.34.0.0,f.52.0.0,day
0,1000019,1960,November,1.0
1,1000022,1954,August,1.0
2,1000035,1944,May,1.0
3,1000046,1946,March,1.0
4,1000054,1942,January,1.0
...,...,...,...,...
459151,6025390,1942,March,1.0
459152,6025409,1946,November,1.0
459153,6025411,1960,November,1.0
459154,6025425,1963,August,1.0


In [27]:
###Convering month names to number
df.loc[(df['f.52.0.0'] == 'January') , 'month'] = 1 
df.loc[(df['f.52.0.0'] == 'February') , 'month'] = 2
df.loc[(df['f.52.0.0'] == 'March') , 'month'] = 3
df.loc[(df['f.52.0.0'] == 'April') , 'month'] = 4
df.loc[(df['f.52.0.0'] == 'May') , 'month'] = 5
df.loc[(df['f.52.0.0'] == 'June') , 'month'] = 6
df.loc[(df['f.52.0.0'] == 'July') , 'month'] = 7
df.loc[(df['f.52.0.0'] == 'August') , 'month'] = 8
df.loc[(df['f.52.0.0'] == 'September') , 'month'] = 9
df.loc[(df['f.52.0.0'] == 'October') , 'month'] = 10
df.loc[(df['f.52.0.0'] == 'November') , 'month'] = 11
df.loc[(df['f.52.0.0'] == 'December') , 'month'] = 12
df

  self.obj[key] = infer_fill_value(value)


Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,day,month
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,1.0,11.0
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,1.0,8.0
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,1.0,5.0
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,1.0,3.0
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459151,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,1.0,3.0
459152,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,1.0,11.0
459153,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,1.0,11.0
459154,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,1.0,8.0


In [22]:
### Combining three columns, year (f.31.0.0), month and day to generate one column as date of birth

In [28]:
import datetime
import time

In [35]:
df['Date']= pd.to_datetime(dict(year=df['f.34.0.0'], month=df["month"], day=df['day']))
df

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,day,month,Date
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,1.0,11.0,1960-11-01
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,1.0,8.0,1954-08-01
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,1.0,5.0,1944-05-01
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,1.0,3.0,1946-03-01
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,1.0,1.0,1942-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459151,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,1.0,3.0,1942-03-01
459152,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,1.0,11.0,1946-11-01
459153,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,1.0,11.0,1960-11-01
459154,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,1.0,8.0,1963-08-01


In [36]:
### convert BD to date format
df['Date']= pd.to_datetime(df['Date'])

In [39]:
icd9_dates = [i for i in df if i.startswith('f.41281.0.')]

In [40]:
icd9_dates

['f.41281.0.0',
 'f.41281.0.1',
 'f.41281.0.2',
 'f.41281.0.3',
 'f.41281.0.4',
 'f.41281.0.5',
 'f.41281.0.6',
 'f.41281.0.7',
 'f.41281.0.8',
 'f.41281.0.9',
 'f.41281.0.10',
 'f.41281.0.11',
 'f.41281.0.12',
 'f.41281.0.13',
 'f.41281.0.14',
 'f.41281.0.15',
 'f.41281.0.16',
 'f.41281.0.17',
 'f.41281.0.18',
 'f.41281.0.19',
 'f.41281.0.20',
 'f.41281.0.21',
 'f.41281.0.22',
 'f.41281.0.23',
 'f.41281.0.24',
 'f.41281.0.25',
 'f.41281.0.26',
 'f.41281.0.27',
 'f.41281.0.28',
 'f.41281.0.29',
 'f.41281.0.30',
 'f.41281.0.31',
 'f.41281.0.32',
 'f.41281.0.33',
 'f.41281.0.34',
 'f.41281.0.35',
 'f.41281.0.36',
 'f.41281.0.37',
 'f.41281.0.38',
 'f.41281.0.39',
 'f.41281.0.40',
 'f.41281.0.41',
 'f.41281.0.42',
 'f.41281.0.43',
 'f.41281.0.44',
 'f.41281.0.45',
 'f.41281.0.46']

### Estimate the age of inviduals when they report any ICD9 codes

In [41]:
### change class for all ICD9 dates
for x in range(0,47):
    colname = "f.41281.0.{}".format(x)
    df[colname] = pd.to_datetime(df[colname])

In [43]:
df[icd9_colnames]

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,...,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,3000,5198,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459151,,,,,,,,,,,...,,,,,,,,,,
459152,,,,,,,,,,,...,,,,,,,,,,
459153,,,,,,,,,,,...,,,,,,,,,,
459154,,,,,,,,,,,...,,,,,,,,,,


In [45]:
## estimating days differences between BD and ICD9 report date
for x in range(0,47):
    colname = "f.41281.0.{}".format(x)
    newcolname = "age_f.41281.0.{}".format(x)
    df[newcolname] = (df[colname] - df['Date']) / np.timedelta64(1, 'Y')

  df[newcolname] = (df[colname] - df['Date']) / np.timedelta64(1, 'Y')


In [46]:
df['age_f.41281.0.0'].dtype

dtype('float64')

In [47]:
df

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41281.0.37,age_f.41281.0.38,age_f.41281.0.39,age_f.41281.0.40,age_f.41281.0.41,age_f.41281.0.42,age_f.41281.0.43,age_f.41281.0.44,age_f.41281.0.45,age_f.41281.0.46
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459151,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,
459152,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,
459153,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,
459154,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,


In [48]:
test = df [["IID","f.41271.0.0", "age_f.41281.0.0","f.41271.0.1", "age_f.41281.0.1"]]
test

Unnamed: 0,IID,f.41271.0.0,age_f.41281.0.0,f.41271.0.1,age_f.41281.0.1
0,1000019,,,,
1,1000022,,,,
2,1000035,,,,
3,1000046,,,,
4,1000054,3000,39.037078,5198,39.037078
...,...,...,...,...,...
459151,6025390,,,,
459152,6025409,,,,
459153,6025411,,,,
459154,6025425,,,,


### Estimate the age of inviduals when they report any ICD10 codes

In [49]:
icd10_dates = [i for i in df if i.startswith('f.41280.0.')]

In [50]:
icd10_dates

['f.41280.0.0',
 'f.41280.0.1',
 'f.41280.0.2',
 'f.41280.0.3',
 'f.41280.0.4',
 'f.41280.0.5',
 'f.41280.0.6',
 'f.41280.0.7',
 'f.41280.0.8',
 'f.41280.0.9',
 'f.41280.0.10',
 'f.41280.0.11',
 'f.41280.0.12',
 'f.41280.0.13',
 'f.41280.0.14',
 'f.41280.0.15',
 'f.41280.0.16',
 'f.41280.0.17',
 'f.41280.0.18',
 'f.41280.0.19',
 'f.41280.0.20',
 'f.41280.0.21',
 'f.41280.0.22',
 'f.41280.0.23',
 'f.41280.0.24',
 'f.41280.0.25',
 'f.41280.0.26',
 'f.41280.0.27',
 'f.41280.0.28',
 'f.41280.0.29',
 'f.41280.0.30',
 'f.41280.0.31',
 'f.41280.0.32',
 'f.41280.0.33',
 'f.41280.0.34',
 'f.41280.0.35',
 'f.41280.0.36',
 'f.41280.0.37',
 'f.41280.0.38',
 'f.41280.0.39',
 'f.41280.0.40',
 'f.41280.0.41',
 'f.41280.0.42',
 'f.41280.0.43',
 'f.41280.0.44',
 'f.41280.0.45',
 'f.41280.0.46',
 'f.41280.0.47',
 'f.41280.0.48',
 'f.41280.0.49',
 'f.41280.0.50',
 'f.41280.0.51',
 'f.41280.0.52',
 'f.41280.0.53',
 'f.41280.0.54',
 'f.41280.0.55',
 'f.41280.0.56',
 'f.41280.0.57',
 'f.41280.0.58',
 'f.412

In [51]:
### change class for all ICD10 dates
for x in range(0,259):
    colname_ICD10 = "f.41280.0.{}".format(x)
    df[colname_ICD10] = pd.to_datetime(df[colname_ICD10])

In [52]:
df[colname_ICD10].dtype

dtype('<M8[ns]')

In [53]:
## estimating days differences between BD and ICD10 report date
for x in range(0,259):
    colname_ICD10 = "f.41280.0.{}".format(x)
    newcolname_ICD10 = "age_f.41280.0.{}".format(x)
    df[newcolname_ICD10] = (df[colname_ICD10] - df['Date']) / np.timedelta64(1, 'Y')

  df[newcolname_ICD10] = (df[colname_ICD10] - df['Date']) / np.timedelta64(1, 'Y')


In [54]:
df[['age_f.41280.0.0']]

Unnamed: 0,age_f.41280.0.0
0,53.055162
1,60.997830
2,70.024710
3,63.667290
4,72.017906
...,...
459151,78.873625
459152,
459153,36.529155
459154,45.572462


In [55]:
ICD10_test = df [["IID","Date", "f.41270.0.0", "f.41280.0.0","age_f.41280.0.0","f.41270.0.1","f.41280.0.1", "age_f.41280.0.1", "f.41270.0.100","f.41280.0.100", "age_f.41280.0.100"]]

In [56]:
ICD10_test

Unnamed: 0,IID,Date,f.41270.0.0,f.41280.0.0,age_f.41280.0.0,f.41270.0.1,f.41280.0.1,age_f.41280.0.1,f.41270.0.100,f.41280.0.100,age_f.41280.0.100
0,1000019,1960-11-01,E041,2013-11-21,53.055162,F329,2020-03-02,59.333183,,NaT,
1,1000022,1954-08-01,F101,2015-07-31,60.997830,J342,2002-03-14,47.617679,,NaT,
2,1000035,1944-05-01,H269,2014-05-10,70.024710,K579,2007-02-14,62.791159,,NaT,
3,1000046,1946-03-01,E780,2009-10-30,63.667290,G473,2014-05-28,68.242332,,NaT,
4,1000054,1942-01-01,C679,2014-01-07,72.017906,C787,2014-04-08,72.267055,,NaT,
...,...,...,...,...,...,...,...,...,...,...,...
459151,6025390,1942-03-01,A049,2021-01-13,78.873625,A099,2020-12-01,78.755895,,NaT,
459152,6025409,1946-11-01,,NaT,,,NaT,,,NaT,
459153,6025411,1960-11-01,O149,1997-05-13,36.529155,O266,1997-05-13,36.529155,,NaT,
459154,6025425,1963-08-01,G551,2009-02-25,45.572462,G558,2018-08-08,55.020979,,NaT,


In [43]:
### saving the working file wirh ages for ICD9
#df.to_csv("/mnt/vast/hpc/csg/en2509/Phenotype/500k.ARHL.phe_ICD9_10_ages.csv",index=False)

In [57]:
df["f.41280.0.100"].value_counts(dropna=False)

NaT           457811
2020-05-27         4
2019-11-26         4
2020-11-02         4
2020-06-30         3
               ...  
2007-04-04         1
1999-10-30         1
2016-06-22         1
2004-11-15         1
2014-10-08         1
Name: f.41280.0.100, Length: 1230, dtype: int64

In [58]:
df["age_f.41280.0.100"].value_counts(dropna=False)

NaN          457811
72.083616         3
81.942819         2
60.882838         2
62.785683         2
              ...  
43.434157         1
74.090502         1
68.910381         1
72.759879         1
74.101453         1
Name: age_f.41280.0.100, Length: 1268, dtype: int64

In [6]:
### keep white European. In this working file I've already subsetted the white european
#white = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/ukb47922_white_460649ind.pheno", sep = '\t', engine = 'python')

In [59]:
### chnaging string to int before merging 
df['IID'] = df['IID'].astype(int)
df['FID'] = df['FID'].astype(int)

In [8]:
#df_white = white.merge(df, on=['IID','FID'], how='inner')

In [9]:
#df_white

Unnamed: 0,FID,IID,ethnicity,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,age_f.41280.0.216,age_f.41280.0.217,age_f.41280.0.218,age_f.41280.0.219,age_f.41280.0.220,age_f.41280.0.221,age_f.41280.0.222,age_f.41280.0.223,age_f.41280.0.224,age_f.41280.0.225
0,1000019,1000019,British,Female,1960,November,Yes,,,,...,,,,,,,,,,
1,1000022,1000022,British,Male,1954,August,Yes,,,,...,,,,,,,,,,
2,1000035,1000035,British,Male,1944,May,No,,,,...,,,,,,,,,,
3,1000046,1000046,British,Female,1946,March,No,,No,,...,,,,,,,,,,
4,1000054,1000054,British,Female,1942,January,No,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460644,6025390,6025390,British,Female,1942,March,No,,,,...,,,,,,,,,,
460645,6025409,6025409,British,Female,1946,November,No,No,,,...,,,,,,,,,,
460646,6025411,6025411,British,Female,1960,November,No,,,,...,,,,,,,,,,
460647,6025425,6025425,British,Female,1963,August,No,,,,...,,,,,,,,,,


In [10]:
### saving the working file wirh ages for ICD9 and white Europeans
#df_white.to_csv("/mnt/vast/hpc/csg/en2509/Phenotype/500k.ARHL.phe_ICD9_10_ages_whiteEuro_EN.csv",index=False)

In [20]:
#df = pd.read_csv("/mnt/vast/hpc/csg/en2509/Phenotype/500k.ARHL.phe_ICD9_10_ages_whiteEuro_EN.csv", quotechar = '"', dtype="string")
#df

Unnamed: 0,FID,IID,ethnicity,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,age_f.41280.0.216,age_f.41280.0.217,age_f.41280.0.218,age_f.41280.0.219,age_f.41280.0.220,age_f.41280.0.221,age_f.41280.0.222,age_f.41280.0.223,age_f.41280.0.224,age_f.41280.0.225
0,1000019,1000019,British,Female,1960,November,Yes,,,,...,,,,,,,,,,
1,1000022,1000022,British,Male,1954,August,Yes,,,,...,,,,,,,,,,
2,1000035,1000035,British,Male,1944,May,No,,,,...,,,,,,,,,,
3,1000046,1000046,British,Female,1946,March,No,,No,,...,,,,,,,,,,
4,1000054,1000054,British,Female,1942,January,No,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460644,6025390,6025390,British,Female,1942,March,No,,,,...,,,,,,,,,,
460645,6025409,6025409,British,Female,1946,November,No,No,,,...,,,,,,,,,,
460646,6025411,6025411,British,Female,1960,November,No,,,,...,,,,,,,,,,
460647,6025425,6025425,British,Female,1963,August,No,,,,...,,,,,,,,,,


In [60]:
# outlier individuals that will need to be removed
outlier = pd.read_csv("/mnt/mfs/statgen/UKBiobank/results/092821_PCA_results_500K/092821_PCA_related_pval0.005/ukb47922_white_460649ind.092821_PCA_related_pval0.005.pca.projected.outliers", sep="\t", header=None)
outlier

Unnamed: 0,0,1
0,1003423,1003423
1,1008606,1008606
2,1009852,1009852
3,1010412,1010412
4,1010678,1010678
...,...,...
1377,5801962,5801962
1378,5807807,5807807
1379,5809112,5809112
1380,5833189,5833189


In [61]:
outlier.columns =['FID', 'IID']

In [62]:
list(outlier.columns)

['FID', 'IID']

In [63]:
df.IID.dtype

dtype('int64')

In [64]:
outlier.IID.dtype

dtype('int64')

In [65]:
Filtered_PCA = df[~df.loc[:,'IID'].isin(outlier['IID'])]


In [66]:
Filtered_PCA

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.249,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459151,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,
459152,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,
459153,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,
459154,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,


In [67]:
### saving the working file wirh ages for ICD9, white Europeans and after removing PCA outliers
Filtered_PCA.to_csv("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/071923_ukb673643_459241ind_call90_subsetvars_ICD9_10_ages_whiteEuro_PCAout.csv",index=False)

In [None]:
### saving the working file wirh ages for ICD9, white Europeans and after removing PCA outliers
#Filtered_PCA_Exclude.to_csv("/mnt/vast/hpc/csg/en2509/Phenotype/500k.IC9_ICD10_self_reported_EN.csv",index=False)

## From here, filter dataframe which have age less than 40 years for any of ICD9, ICD10 or self-reported 

In [68]:
#df = pd.read_csv("/mnt/vast/hpc/csg/en2509/Phenotype/500k.IC9_ICD10_self_reported_EN.csv", quotechar = '"', index_col = "IID")
df = pd.read_csv("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/071923_ukb673643_459241ind_call90_subsetvars_ICD9_10_ages_whiteEuro_PCAout.csv", quotechar = '"')
df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.249,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459151,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,
459152,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,
459153,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,
459154,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,


In [69]:
#list(df.columns)

In [70]:
df["f.41270.0.0"].value_counts() #categories in ICD10

A099     18147
E780     15615
I10      12603
E039     10544
C61       9038
         ...  
G620         1
R788         1
M0604        1
L981         1
O074         1
Name: f.41270.0.0, Length: 4863, dtype: int64

In [71]:
df["f.41271.0.0"].value_counts() #categories in ICD9

V252       795
4549       554
4549.0     324
2189       299
6262       243
          ... 
2221.0       1
3594.0       1
721.0        1
72709.0      1
5343         1
Name: f.41271.0.0, Length: 2625, dtype: int64

In [72]:
df["f.20002.0.0"].value_counts() # Categories of non-cancer illness

1065.0    103596
1111.0     34508
1465.0     10777
1075.0     10269
1074.0      9873
           ...  
1496.0         1
1521.0         1
1468.0         1
1621.0         1
1678.0         1
Name: f.20002.0.0, Length: 440, dtype: int64

In [73]:
def self(number):
    if  number == 1415:
        return "self"
    else:
        return "Not Sure"

In [74]:
selfreport_colnames = [col for col in df if "f.20002" in col]
for col in selfreport_colnames:
    df["test"] = df[col].apply(self)

In [75]:
df["test"].value_counts() 

Not Sure    459156
Name: test, dtype: int64

In [76]:
mask_list = []

for i in range(32):
    mask_list.append(df[f"f.20002.0.{i}"] == 1415)
    mask_list.append(df[f"f.20002.1.{i}"] == 1415)
    mask_list.append(df[f"f.20002.2.{i}"] == 1415)
    mask_list.append(df[f"f.20002.3.{i}"] == 1415)

combined_mask = pd.concat(mask_list, axis=1).any(axis=1)
df_1415_all_visits = df[combined_mask]
df_1415_all_visits

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
67,1000767,1000767,Male,1959,August,Yes,,,,Yes,...,,,,,,,,,,Not Sure
259,1002847,1002847,Male,1953,November,No,,,,No,...,,,,,,,,,,Not Sure
388,1004257,1004257,Male,1939,December,Yes,,,,Yes,...,,,,,,,,,,Not Sure
496,1005452,1005452,Female,1942,August,Yes,,,,Yes,...,,,,,,,,,,Not Sure
814,1009003,1009003,Female,1950,July,No,,,,No,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458718,6020619,6020619,Female,1944,June,No,,,,No,...,,,,,,,,,,Not Sure
458842,6021968,6021968,Female,1940,December,Yes,,,,Yes,...,,,,,,,,,,Not Sure
458945,6023081,6023081,Female,1941,November,Yes,,,,Yes,...,,,,,,,,,,Not Sure
459022,6023914,6023914,Female,1955,February,Yes,,,,No,...,,,,,,,,,,Not Sure


In [77]:
mask_list = []

for i in range(32):
    mask_list.append(df_1415_all_visits[f"f.20009.0.{i}"] < 40)
    mask_list.append(df_1415_all_visits[f"f.20009.1.{i}"] < 40)
    mask_list.append(df_1415_all_visits[f"f.20009.2.{i}"] < 40)
    mask_list.append(df_1415_all_visits[f"f.20009.3.{i}"] < 40)

combined_mask = pd.concat(mask_list, axis=1).any(axis=1)
df_1415_all_visits_age_less_40 = df_1415_all_visits[combined_mask]
df_1415_all_visits_age_less_40

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
67,1000767,1000767,Male,1959,August,Yes,,,,Yes,...,,,,,,,,,,Not Sure
388,1004257,1004257,Male,1939,December,Yes,,,,Yes,...,,,,,,,,,,Not Sure
496,1005452,1005452,Female,1942,August,Yes,,,,Yes,...,,,,,,,,,,Not Sure
895,1009880,1009880,Female,1955,October,No,,,,No,...,,,,,,,,,,Not Sure
901,1009950,1009950,Male,1950,October,Yes,Yes,Yes,,No,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458016,6012912,6012912,Female,1947,June,Yes,,,,Yes,...,,,,,,,,,,Not Sure
458154,6014453,6014453,Male,1939,June,Yes,,,,Yes,...,,,,,,,,,,Not Sure
458675,6020151,6020151,Male,1957,November,Yes,Yes,Yes,,Yes,...,,,,,,,,,,Not Sure
458718,6020619,6020619,Female,1944,June,No,,,,No,...,,,,,,,,,,Not Sure


In [22]:
### saving individuals with 1415 codes and younger than 40 years old (N= 3,923 ind)
### these individuals should remove from ARHL but not tinnitus analysis
#df_1415_all_visits_age_less_40.to_csv("/mnt/vast/hpc/csg/en2509/Phenotype/ind_1415_younger_than_40yo_EN.csv",index=False)

# For ICD10 interested codes

In [78]:
cols = df.filter(like='f.41270.0.').columns
mask = df[cols].isin(["H903", "H904", "H905", "H906", "H907", "H908", "H918", "H919"]).any(axis=1)
df_ICD10_all_visits = df[mask]
df_ICD10_all_visits

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
55,1000611,1000611,Female,1953,March,No,,,,No,...,,,,,,,,,,Not Sure
95,1001067,1001067,Male,1959,April,Yes,,,,Yes,...,,,,,,,,,,Not Sure
108,1001196,1001196,Male,1949,September,Yes,,Yes,,Yes,...,,,,,,,,,,Not Sure
133,1001459,1001459,Male,1944,January,Yes,,,,Yes,...,,,,,,,,,,Not Sure
232,1002548,1002548,Male,1948,January,Yes,,,,Yes,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459056,6024282,6024282,Female,1953,August,Yes,,,,Yes,...,,,,,,,,,,Not Sure
459061,6024352,6024352,Female,1939,October,Yes,,,,Yes,...,,,,,,,,,,Not Sure
459069,6024448,6024448,Female,1951,July,Yes,,,,Yes,...,,,,,,,,,,Not Sure
459104,6024829,6024829,Female,1952,February,Do not know,,,,Yes,...,,,,,,,,,,Not Sure


In [79]:
from functools import reduce

columns = [f"f.41270.0.{i}" for i in range(259)]
columns_age = [f"f.41280.0.{i}" for i in range(259)]
masks = [df[col].isin(["H903", "H904", "H905", "H906", "H907", "H908", "H918", "H919"]) for col in columns]
Amasks = [df[f"age_{col}"] < 40 for col in columns_age]

df_ICD10_all_visits_age_less_than40 = df[reduce(lambda x, y: x | y, masks) & reduce(lambda x, y: x | y, Amasks)]
df_ICD10_all_visits_age_less_than40


Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
1913,1021134,1021134,Female,1965,February,No,,,,No,...,,,,,,,,,,Not Sure
2436,1026921,1026921,Female,1964,July,Yes,,,,No,...,,,,,,,,,,Not Sure
2493,1027574,1027574,Female,1961,March,Yes,,,,Yes,...,,,,,,,,,,Not Sure
3337,1036744,1036744,Female,1967,May,Yes,,,,Yes,...,,,,,,,,,,Not Sure
3458,1038061,1038061,Female,1967,September,Yes,,,,Yes,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456652,5998064,5998064,Female,1967,December,Yes,,,,Yes,...,,,,,,,,,,Not Sure
456830,6000020,6000020,Female,1967,July,Do not know,,,,Yes,...,,,,,,,,,,Not Sure
457448,6006842,6006842,Female,1960,May,No,,,,No,...,,,,,,,,,,Not Sure
457663,6009163,6009163,Male,1967,October,Yes,,,,Yes,...,,,,,,,,,,Not Sure


In [80]:
df_ICD10_all_visits_age_less_than40.to_csv("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072023_ukb673643ind_ICD10_younger_than_40yo_DMCS.csv",index=False)

#### Combine IDs of ICD10 and selfreported who have less than 40 years old and should be excluded


In [81]:
IDs_ICD10_all_visits_age_less_than40 = df_ICD10_all_visits_age_less_than40[["IID", "FID"]]
IDs_1415_all_visits_age_less_40 = df_1415_all_visits_age_less_40[["IID", "FID"]]


In [82]:
frames = [IDs_ICD10_all_visits_age_less_than40, IDs_1415_all_visits_age_less_40]
Combine = pd.concat(frames)
Combine 

Unnamed: 0,IID,FID
1913,1021134,1021134
2436,1026921,1026921
2493,1027574,1027574
3337,1036744,1036744
3458,1038061,1038061
...,...,...
458016,6012912,6012912
458154,6014453,6014453
458675,6020151,6020151
458718,6020619,6020619


In [83]:
Unique_IDs= Combine.drop_duplicates(subset=['IID'], keep=False)
Unique_IDs

Unnamed: 0,IID,FID
1913,1021134,1021134
2436,1026921,1026921
2493,1027574,1027574
3337,1036744,1036744
3458,1038061,1038061
...,...,...
458016,6012912,6012912
458154,6014453,6014453
458675,6020151,6020151
458718,6020619,6020619


In [84]:
Unique_IDs.to_csv("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072023_ukb673643_IDs_1415_ICD10_younger_than_40yo_DMCS.csv",index=False)

In [85]:
#Remove individuals younger than 40 years for ICD10 codes and selfrepoerted from main working dataframe
White_EU_PCA_40years = df[~df.loc[:,'IID'].isin(Unique_IDs['IID'])]

In [86]:
White_EU_PCA_40years

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,Not Sure
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,Not Sure
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,Not Sure
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,Not Sure
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459151,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,Not Sure
459152,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,Not Sure
459153,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,Not Sure
459154,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,Not Sure


In [87]:
# saving main working dataframe after exluding individulas younger than 40 for ICD10 and selfreported 
White_EU_PCA_40years.to_csv("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072023_ukb673643_500k.ARHL.phe_ICD9_10_ages_whiteEuro_PCAout_Olderthan_40_DMCS.csv",index=False)

#### Read clean dataframe (after age filtering) and exclusion other criteria for icd10, icd9, and self-report

In [88]:
df = pd.read_csv("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072023_ukb673643_500k.ARHL.phe_ICD9_10_ages_whiteEuro_PCAout_Olderthan_40_DMCS.csv", quotechar = '"', dtype="string")
df

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,Not Sure
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,Not Sure
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,Not Sure
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,Not Sure
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,Not Sure
455541,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,Not Sure
455542,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,Not Sure
455543,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,Not Sure


In [89]:
exclusion = pd.read_csv("/mnt/vast/hpc/csg/en2509/Phenotype/ICD10_9_selfreport_incl_excl_01252023_EN.csv", quotechar = '"', dtype="string")
exclusion

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,White_EU_After_PCA_clean,less than 55 years,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257
0,f.41270,H60-H62 Diseases of external ear,,,,,,,,
1,f.41270,H60 Otitis externa,,,,,,,,
2,f.41270,H60.0 Abscess of external ear,32,,,N,N,,,
3,f.41270,H60.1 Cellulitis of external ear,218,,,N,N,,,
4,f.41270,H60.2 Malignant otitis externa,49,,,N,N,,,
...,...,...,...,...,...,...,...,...,...,...
566,f.20002,1491 brain haemorrhage,218,,,Y,,,,
567,f.20002,1583 ischaemic stroke,44,,,N,N,,,
568,f.20002,1082 transient ischaemic attack (tia),2243,,,N,N,,,
569,f.20002,1083 subdural haemorrhage/haematoma,212,,,Y,,,,


If individuals have certain codes from ICD 10, ICD 9, and self-reports they must be fully removed from the analysis. 

In [90]:
# returns if the current individual should be excluded based on the exclusion list
def contains_exclusion(row, exclusion_list):
    for i in row:
        if not pd.isna(i) and i in exclusion_list:
            return True
            
    return False

### Filter out ICD 10 exclusions

In [91]:
# these are the columns that represent the icd10 columns in the database
icd10_colnames = [col for col in df if "f.41270" in col]
icd10_colnames_ages = [col for col in df if "age_f.41280" in col]

In [92]:
# get a dataframe that only contains the icd10 columns from the full database
icd10 = df[icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,...,f.41270.0.249,f.41270.0.250,f.41270.0.251,f.41270.0.252,f.41270.0.253,f.41270.0.254,f.41270.0.255,f.41270.0.256,f.41270.0.257,f.41270.0.258
0,E041,F329,H738,M179,M202,M2320,M750,M754,M758,N898,...,,,,,,,,,,
1,F101,J342,R619,S8280,W010,,,,,,...,,,,,,,,,,
2,H269,K579,K590,K621,M5459,N40,R040,R31,R398,Z466,...,,,,,,,,,,
3,E780,G473,R065,R074,Z824,,,,,,...,,,,,,,,,,
4,C679,C787,C795,C798,D090,I802,I959,J181,K922,M169,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,A049,A099,A419,B370,C859,D649,E278,E279,E538,E559,...,,,,,,,,,,
455541,,,,,,,,,,,...,,,,,,,,,,
455542,O149,O266,O342,O471,O48,O610,O680,Z370,,,...,,,,,,,,,,
455543,G551,G558,I10,M501,Z981,,,,,,...,,,,,,,,,,


In [93]:
# get rows from exclusion database that contian the codes that need to be removed for icd10
exclude_icd10 = exclusion[(exclusion["UKBB_field_code"] == "f.41270") & (exclusion["Excluded_fulldb_lateonsetHI"] == 'Y') ]
exclude_icd10

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,White_EU_After_PCA_clean,less than 55 years,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257
27,f.41270,H65.2 Chronic serous otitis media,103,,,Y,,,,
28,f.41270,H65.3 Chronic mucoid otitis media,960,,,Y,,,,
29,f.41270,H65.4 Other chronic nonsuppurative otitis media,158,,,Y,,,,
30,f.41270,"H65.9 Nonsuppurative otitis media, unspecified",508,,,Y,,,,
33,f.41270,H66.1 Chronic tubotympanic suppurative otitis ...,40,,,Y,,,,
...,...,...,...,...,...,...,...,...,...,...
276,f.41270,"S07.9 Crushing injury of head, part unspecified",1,,,Y,,,,
279,f.41270,S08.1 Traumatic amputation of ear,13,,,Y,,,,
280,f.41270,S08.8 Traumatic amputation of other parts of head,1,,,Y,,,,
281,f.41270,S08.9 Traumatic amputation of unspecified part...,1,,,Y,,,,


In [94]:
# get the icd10 codes that should be excluded from database
ex_critia_icd10 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_icd10["Phenotype"].tolist()]
ex_critia_icd10

['H652',
 'H653',
 'H654',
 'H659',
 'H661',
 'H662',
 'H663',
 'H664',
 'H669',
 'H680',
 'H701',
 'H702',
 'H708',
 'H709',
 'H71',
 'H731',
 'H738',
 'H739',
 'H740',
 'H741',
 'H742',
 'H743',
 'H748',
 'H749',
 'H750',
 'H758',
 'H800',
 'H801',
 'H802',
 'H808',
 'H809',
 'H810',
 'H830',
 'H831',
 'H832',
 'H900',
 'H901',
 'H902',
 'H910',
 'H913',
 'H930',
 'H933',
 'H940',
 'H948',
 'H950',
 'H951',
 'H958',
 'H959',
 'B020',
 'B021',
 'B022',
 'B023',
 'B027',
 'B028',
 'G000',
 'G001',
 'G002',
 'G003',
 'G008',
 'G009',
 'G01',
 'G020',
 'G021',
 'G028',
 'G030',
 'G031',
 'G032',
 'G038',
 'G039',
 'G040',
 'G041',
 'G042',
 'G048',
 'G049',
 'G050',
 'G051',
 'G052',
 'G058',
 'G060',
 'G061',
 'G062',
 'G07',
 'G08',
 'G09',
 'G510',
 'G511',
 'G512',
 'G513',
 'G514',
 'G518',
 'G519',
 'S0200',
 'S0201',
 'S0210',
 'S0211',
 'S0240',
 'S0241',
 'S0260',
 'S0261',
 'S0270',
 'S0271',
 'S0280',
 'S0281',
 'S0290',
 'S0291',
 'S045',
 'S046',
 'S049',
 'S0600',
 'S0601',

In [95]:
# collect the individuals that should be excluded because of icd10
ex_fxn_icd10 = lambda row: contains_exclusion(row, ex_critia_icd10)
ex_10 = icd10.apply(ex_fxn_icd10, axis=1)

In [96]:
filtered_ICD10 = df[ex_10]
filtered_ICD10

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,Not Sure
31,1000357,1000357,Female,1940,March,No,,,,No,...,,,,,,,,,,Not Sure
41,1000477,1000477,Male,1959,July,No,,No,,No,...,,,,,,,,,,Not Sure
83,1000947,1000947,Female,1941,August,Yes,,,,No,...,,,,,,,,,,Not Sure
85,1000965,1000965,Female,1954,December,No,,,,No,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455490,6024796,6024796,Female,1946,August,No,,,,Yes,...,,,,,,,,,,Not Sure
455504,6024960,6024960,Male,1951,November,Yes,,,,Yes,...,,,,,,,,,,Not Sure
455505,6024974,6024974,Female,1941,May,Yes,,,,Yes,...,,,,,,,,,,Not Sure
455522,6025176,6025176,Female,1950,August,Yes,,,,No,...,,,,,,,,,,Not Sure


In [97]:
ICD10_IDs = filtered_ICD10 [["FID", "IID"]]
ICD10_IDs  

Unnamed: 0,FID,IID
0,1000019,1000019
31,1000357,1000357
41,1000477,1000477
83,1000947,1000947
85,1000965,1000965
...,...,...
455490,6024796,6024796
455504,6024960,6024960
455505,6024974,6024974
455522,6025176,6025176


In [98]:
# remove them from the working database (which is now filtered. df remains unchanged)
filtered = df[~ex_10]

In [99]:
print(sum(ex_10), "individuals removed because of icd10 codes")

13810 individuals removed because of icd10 codes


In [100]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,Not Sure
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,Not Sure
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,Not Sure
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,Not Sure
5,1000063,1000063,Male,1967,April,No,,,,No,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,Not Sure
455541,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,Not Sure
455542,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,Not Sure
455543,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,Not Sure


### Filter out ICD 9 exclusions

In [101]:
# these are the columns that represent the icd9 columns in the working database
icd9_colnames = [col for col in filtered if "f.41271" in col]

In [102]:
# get a dataframe that only contains the icd9 columns from the working database
icd9 = filtered[icd9_colnames]
icd9

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,...,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,3000,5198.0,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,,,,,,,,,,,...,,,,,,,,,,
455541,,,,,,,,,,,...,,,,,,,,,,
455542,,,,,,,,,,,...,,,,,,,,,,
455543,,,,,,,,,,,...,,,,,,,,,,


In [103]:
# get rows from exclusion database that contian the codes that need to be removed for icd9
exclude_icd9 = exclusion[(exclusion["UKBB_field_code"] == "f.41271") & (exclusion["Excluded_fulldb_lateonsetHI"] == 'Y') ]
exclude_icd9

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,White_EU_After_PCA_clean,less than 55 years,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257
319,f.41271,3811 Chronic serous otitis media,8,,,Y,,,,
320,f.41271,3812 Chronic mucoid otitis media,11,,,Y,,,,
321,f.41271,3813 Other and unspecified chronic nonsuppurat...,3,,,Y,,,,
322,f.41271,"3814 Nonsuppurative otitis media, not specifie...",19,,,Y,,,,
323,f.41271,3815 Eustachian salpingitis,0,,,Y,,,,
...,...,...,...,...,...,...,...,...,...,...
516,f.41271,9050 Late effect of fracture of skull and face...,19,,,Y,,,,
526,f.41271,"9259 Crushing injury of face, scalp and neck",2,,,Y,,,,
532,f.41271,9514 Injury to facial nerve,0,,,Y,,,,
533,f.41271,9515 Injury to acoustic nerve,1,,,Y,,,,


In [104]:
# get the icd9 codes that should be excluded from the working database
ex_critia_icd9 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_icd9["Phenotype"].tolist()]
ex_critia_icd9

['3811',
 '3812',
 '3813',
 '3814',
 '3815',
 '3816',
 '3819',
 '3821',
 '3822',
 '3823',
 '3824',
 '3829',
 '3831',
 '3832',
 '3833',
 '3838',
 '3839',
 '3841',
 '3850',
 '3851',
 '3852',
 '3853',
 '3858',
 '3859',
 '3860',
 '3863',
 '3864',
 '3865',
 '3868',
 '3869',
 '3870',
 '3871',
 '3872',
 '3878',
 '3879',
 '3885',
 '3890',
 '0530',
 '0531',
 '0532',
 '0537',
 '0538',
 '3200',
 '3201',
 '3202',
 '3203',
 '3204',
 '3205',
 '3207',
 '3208',
 '3209',
 '3210',
 '3211',
 '3212',
 '3213',
 '3214',
 '3215',
 '3216',
 '3217',
 '3218',
 '3220',
 '3221',
 '3222',
 '3229',
 '3230',
 '3231',
 '3232',
 '3233',
 '3234',
 '3235',
 '3236',
 '3237',
 '3238',
 '3239',
 '3240',
 '3241',
 '3249',
 '3259',
 '3269',
 '3510',
 '3511',
 '3518',
 '3519',
 '8000',
 '8001',
 '8002',
 '8003',
 '8010',
 '8011',
 '8012',
 '8013',
 '8022',
 '8023',
 '8024',
 '8025',
 '8028',
 '8029',
 '8030',
 '8031',
 '8032',
 '8033',
 '8040',
 '8041',
 '8042',
 '8043',
 '8509',
 '8510',
 '8511',
 '8520',
 '8521',
 '8530',
 

In [105]:
# collect the individuals that should be excluded because of icd9
ex_fxn_icd9 = lambda row: contains_exclusion(row, ex_critia_icd9)
ex_9 = icd9.apply(ex_fxn_icd9, axis=1)

In [106]:
ICD9_ID = filtered[ex_9]
ICD9_IDs = ICD9_ID [["FID", "IID"]]
ICD9_IDs

Unnamed: 0,FID,IID
1356,1015167,1015167
1773,1019691,1019691
4013,1044629,1044629
6027,1067104,1067104
7116,1079051,1079051
...,...,...
451453,5980379,5980379
451858,5984843,5984843
452432,5991165,5991165
453358,6001389,6001389


In [107]:
# remove them from the working database
filtered = filtered[~ex_9]
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,Not Sure
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,Not Sure
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,Not Sure
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,Not Sure
5,1000063,1000063,Male,1967,April,No,,,,No,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,Not Sure
455541,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,Not Sure
455542,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,Not Sure
455543,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,Not Sure


In [108]:
print(sum(ex_9), "individuals removed because of icd9 codes")

541 individuals removed because of icd9 codes


In [109]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,Not Sure
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,Not Sure
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,Not Sure
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,Not Sure
5,1000063,1000063,Male,1967,April,No,,,,No,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,Not Sure
455541,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,Not Sure
455542,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,Not Sure
455543,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,Not Sure


### Filter out f.20002 exclusions

In [110]:
# these are the columns that represent the self-report columns in the working database
f20002_colnames = [col for col in df if "f.20002" in col]

In [111]:
# get a dataframe that only contains the self-report columns from the working database
f20002 = df[f20002_colnames]
f20002

Unnamed: 0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,...,f.20002.3.24,f.20002.3.25,f.20002.3.26,f.20002.3.27,f.20002.3.28,f.20002.3.29,f.20002.3.30,f.20002.3.31,f.20002.3.32,f.20002.3.33
0,1111.0,,,,,,,,,,...,,,,,,,,,,
1,1065.0,,,,,,,,,,...,,,,,,,,,,
2,1396.0,1473.0,,,,,,,,,...,,,,,,,,,,
3,1065.0,1294.0,1476.0,1473.0,1374.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,1464.0,,,,,,,,,,...,,,,,,,,,,
455541,1478.0,1473.0,,,,,,,,,...,,,,,,,,,,
455542,,,,,,,,,,,...,,,,,,,,,,
455543,1265.0,,,,,,,,,,...,,,,,,,,,,


In [112]:
# get rows from exclusion database that contian the codes that need to be removed for self-report
exclude_f20002 = exclusion[(exclusion["UKBB_field_code"] == "f.20002") & (exclusion["Excluded_fulldb_lateonsetHI"] == 'Y') ]
exclude_f20002

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,White_EU_After_PCA_clean,less than 55 years,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257
539,f.20002,1420 otosclerosis,260,,,Y,,,,
540,f.20002,1421 meniere's disease,1553,,,Y,,,,
541,f.20002,1499 labyrinthitis,417,,,Y,,,,
545,f.20002,1244 infection of nervous system,55,,,Y,,,,
546,f.20002,1245 brain abscess/intracranial abscess,79,,,Y,,,,
547,f.20002,1246 encephalitis,348,,,Y,,,,
548,f.20002,1247 meningitis,2214,,,Y,,,,
550,f.20002,1249 cranial nerve problem/palsy,289,,,Y,,,,
551,f.20002,1250 bell's palsy/facial nerve palsy,591,,,Y,,,,
553,f.20002,1240 neurological injury/trauma,130,,,Y,,,,


In [113]:
# get the self-report codes that should be excluded from the working database
ex_critia_f20002 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_f20002["Phenotype"].tolist()]
ex_critia_f20002

['1420',
 '1421',
 '1499',
 '1244',
 '1245',
 '1246',
 '1247',
 '1249',
 '1250',
 '1240',
 '1626',
 '1086',
 '1491',
 '1083',
 '1425']

In [114]:
# collect the individuals that should be excluded because of self-report
ex_fxn_f20002 = lambda row: contains_exclusion(row, ex_critia_f20002)
ex_f20002 = f20002.apply(ex_fxn_f20002, axis=1)
df_self_report = filtered[ex_f20002]

  df_self_report = filtered[ex_f20002]


In [115]:
Self_report_IDs = df_self_report[["FID", "IID"]]
Self_report_IDs

Unnamed: 0,FID,IID


In [116]:
#print(sum(ex_f20002), "individuals removed because of self-reported codes")

In [117]:
# remove them from the working database
filtered = filtered[~filtered.loc[:,'IID'].isin(df_self_report['IID'])]
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,Not Sure
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,Not Sure
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,Not Sure
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,Not Sure
5,1000063,1000063,Male,1967,April,No,,,,No,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,Not Sure
455541,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,Not Sure
455542,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,Not Sure
455543,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,Not Sure


In [118]:
# saving main working dataframe after exluding individulas younger than 55 for ICD10 and selfreported and other exclusion critria
filtered.to_csv("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072023_ukb673643_500k.ARHL.phe_ICD9_10_ages_whiteEuro_PCAout_Olderthan_40_after_all_exclusion_critria_DMCS.csv",index=False)

In [119]:
##Combine IDs of individulas how sghould be excluded based on ICD10 and ICD9
frames = [ICD10_IDs, ICD9_IDs, Self_report_IDs]
Combine = pd.concat(frames)
Combine

Unnamed: 0,FID,IID
0,1000019,1000019
31,1000357,1000357
41,1000477,1000477
83,1000947,1000947
85,1000965,1000965
...,...,...
451453,5980379,5980379
451858,5984843,5984843
452432,5991165,5991165
453358,6001389,6001389


In [120]:
Unique_IDs= Combine.drop_duplicates(subset=['IID'], keep=False)
Unique_IDs

Unnamed: 0,FID,IID
0,1000019,1000019
31,1000357,1000357
41,1000477,1000477
83,1000947,1000947
85,1000965,1000965
...,...,...
451453,5980379,5980379
451858,5984843,5984843
452432,5991165,5991165
453358,6001389,6001389


In [121]:
Unique_IDs.to_csv("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/ICD9_10_selfreported_IDs_removed_DMCS.csv",index=False)

In [122]:
Age_matter = pd.read_csv("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072023_ukb673643_IDs_1415_ICD10_younger_than_40yo_DMCS.csv", quotechar = '"', dtype="string")
Age_matter

Unnamed: 0,IID,FID
0,1021134,1021134
1,1026921,1026921
2,1027574,1027574
3,1036744,1036744
4,1038061,1038061
...,...,...
3606,6012912,6012912
3607,6014453,6014453
3608,6020151,6020151
3609,6020619,6020619


In [123]:
##Combine IDs of individulas how sghould be excluded based on ICD10 and ICD9
frames = [ICD10_IDs, ICD9_IDs, Self_report_IDs, Age_matter]
Combine = pd.concat(frames)
Combine

Unnamed: 0,FID,IID
0,1000019,1000019
31,1000357,1000357
41,1000477,1000477
83,1000947,1000947
85,1000965,1000965
...,...,...
3606,6012912,6012912
3607,6014453,6014453
3608,6020151,6020151
3609,6020619,6020619


In [124]:
Unique_IDs= Combine.drop_duplicates(subset=['IID'], keep=False)
Unique_IDs

Unnamed: 0,FID,IID
0,1000019,1000019
31,1000357,1000357
41,1000477,1000477
83,1000947,1000947
85,1000965,1000965
...,...,...
3606,6012912,6012912
3607,6014453,6014453
3608,6020151,6020151
3609,6020619,6020619


In [125]:
Unique_IDs.to_csv("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072023_ukb673643_All_ICD9_10_selfreported_IDs_removed_40yo_DMCS.csv",index=False)

# Phenotype definitions: f.3393, f.2247, f.2257, and f2247_f2257

## Remove inconsistencies or unclear individuals

In [174]:
filtered = pd.read_csv("/mnt/vast/hpc/csg/UKBiobank/phenotype_files/HI_UKBB/072023_ukb673643_500k.ARHL.phe_ICD9_10_ages_whiteEuro_PCAout_Olderthan_40_after_all_exclusion_critria_DMCS.csv", quotechar = '"', dtype="string")
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
0,1000022,1000022,1,1954,8,1.0,,,,1.0,...,,,,,,,,,,Not Sure
1,1000035,1000035,1,1944,5,0.0,,,,1.0,...,,,,,,,,,,Not Sure
2,1000046,1000046,0,1946,3,0.0,,0.0,,0.0,...,,,,,,,,,,Not Sure
3,1000054,1000054,0,1942,1,0.0,,,,1.0,...,,,,,,,,,,Not Sure
4,1000063,1000063,1,1967,4,0.0,,,,0.0,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441189,6025390,6025390,0,1942,3,0.0,,,,1.0,...,,,,,,,,,,Not Sure
441190,6025409,6025409,0,1946,11,0.0,0.0,,,0.0,...,,,,,,,,,,Not Sure
441191,6025411,6025411,0,1960,11,0.0,,,,0.0,...,,,,,,,,,,Not Sure
441192,6025425,6025425,0,1963,8,0.0,,,,0.0,...,,,,,,,,,,Not Sure


Some individuals might be unclear on if they do or do not have hearing difficulties or are inconsistent (found in f.3393, f.2247, and f.2257), in which case they cannot be considered either controls or cases and must be removed.

The conditions for being removed are as follows:
* Saying I don't know after saying either yes or no
* Only saying I don't know or prefer not to say
* Being completely deaf

### Prior to filtering for inconsistencies

<b>Hearing difficulty/problems with background noise</b> <br>
f.2257 = {'Yes': 81218, NA : 513774, 'No': 131091, 'Do not know': 4409, 'Prefer not to answer': 208}

<b>Hearing difficult/problems</b><br>
f.2247 = {'No': 151758, : 513806, 'Yes': 55437, 'Do not know': 9489, 'Prefer not to answer': 171, 'I am completely deaf': 39}

<b>Hearing aid user</b><br>
f.3393 = {'No': 145486, : 577795, 'Yes': 7237, 'Prefer not to answer': 182}

### Setup for inconsistency filtering

In [126]:
print(filtered.dtypes)

IID                  string
FID                  string
f.31.0.0             string
f.34.0.0             string
f.52.0.0             string
                      ...  
age_f.41280.0.255    string
age_f.41280.0.256    string
age_f.41280.0.257    string
age_f.41280.0.258    string
test                 string
Length: 1225, dtype: object


In [128]:
# collect all the columns 
# redefining here for clarity

hearing_imp_f3393 = [col for col in filtered if "f.3393" in col]
hearing_imp_f2247 = [col for col in filtered if "f.2247" in col]
hearing_imp_f2257 = [col for col in filtered if "f.2257" in col]

icd_10_cols = [col for col in filtered if "f.41270" in col]
icd_9_cols = [col for col in filtered if "f.41271" in col]

In [129]:
# for hearing impairement questions we code the answers with the following for comparison
hearing_ans = {"Do not know":9, "Yes":1, "No":0}

# will contain the complete set of actual combinations of answers from individuals in the database
options = set()

# pass one pheno at a time to this function
def find_options(row):
    answer = "".join([str(hearing_ans[i]) for i in row if not pd.isna(i) and i != "I am completely deaf" and i != "Prefer not to answer"])
    options.add(answer)

In [130]:
# this builds the options set to contain a set of all the unique answers the individuals in the database have had
# over their hearing impairment questions
hearing_imp_qs = filtered[hearing_imp_f3393]
s = hearing_imp_qs.apply(find_options, axis=1)
hearing_imp_qs = filtered[hearing_imp_f2247]
s = hearing_imp_qs.apply(find_options, axis=1)
hearing_imp_qs = filtered[hearing_imp_f2257]
s = hearing_imp_qs.apply(find_options, axis=1)

In [131]:
options

{'',
 '0',
 '00',
 '000',
 '0000',
 '0001',
 '0009',
 '001',
 '0010',
 '0011',
 '0019',
 '009',
 '0090',
 '0091',
 '0099',
 '01',
 '010',
 '0100',
 '0101',
 '0109',
 '011',
 '0110',
 '0111',
 '0119',
 '019',
 '0190',
 '0191',
 '0199',
 '09',
 '090',
 '0900',
 '0901',
 '0909',
 '091',
 '0910',
 '0911',
 '0919',
 '099',
 '0990',
 '0991',
 '0999',
 '1',
 '10',
 '100',
 '1000',
 '1001',
 '1009',
 '101',
 '1010',
 '1011',
 '1019',
 '109',
 '1090',
 '1099',
 '11',
 '110',
 '1100',
 '1101',
 '111',
 '1110',
 '1111',
 '1119',
 '119',
 '1190',
 '1191',
 '19',
 '190',
 '1900',
 '1901',
 '191',
 '1910',
 '1911',
 '199',
 '9',
 '90',
 '900',
 '9000',
 '9001',
 '9009',
 '901',
 '9011',
 '909',
 '9090',
 '9099',
 '91',
 '910',
 '911',
 '9110',
 '9111',
 '9119',
 '919',
 '9191',
 '99',
 '990',
 '9901',
 '991',
 '9911',
 '9919',
 '999',
 '9990',
 '9991',
 '9999'}

In [132]:
# we might have inconsistencies if we have don't knows mixed with other answers or even yes and no together
# options contains the set of all unique codes 

do_not_know_no = [i for i in options if '0' in i and '9' in i and '1' not in i]
do_not_know_yes = [i for i in options if '0' not in i and '9' in i and '1' in i]
yes_no = [i for i in options if '0' in i and '9' not in i and '1' in i]
with_all_three = [i for i in options if '0' in i and '9' in i and '1' in i]

# collecting all possible occurances that can be flagged for being inconsistent
might_inconsistent = do_not_know_no + do_not_know_yes + yes_no + with_all_three

# these are exceptions to the possibilities we have that can be flagged for being inconsistent
exceptions = ["91","911","9111","991","0001","001","0011","01","011","0111", "0091", "091","0911","0991","9001","901","9011", "90", "900", "9000", "990"]

# collect the list of answer combinations that are actually inconsistent by removingthe answer combinations that are exceptions
inconsistent = [i for i in might_inconsistent if i not in exceptions]

In [133]:
inconsistent

['0090',
 '0009',
 '9990',
 '099',
 '090',
 '009',
 '909',
 '0900',
 '09',
 '9090',
 '0999',
 '0990',
 '9099',
 '9009',
 '0099',
 '0909',
 '919',
 '9911',
 '1191',
 '1119',
 '1911',
 '119',
 '9991',
 '9191',
 '191',
 '9919',
 '199',
 '19',
 '9119',
 '100',
 '1110',
 '010',
 '101',
 '1011',
 '1000',
 '0101',
 '0110',
 '10',
 '1001',
 '0100',
 '1101',
 '1100',
 '110',
 '1010',
 '0010',
 '910',
 '109',
 '0901',
 '9110',
 '1900',
 '1019',
 '1009',
 '0019',
 '1910',
 '1901',
 '1090',
 '0109',
 '9901',
 '0119',
 '1099',
 '0191',
 '190',
 '0199',
 '0910',
 '1190',
 '0190',
 '0919',
 '019']

### Filtering out the data

In [134]:
# return True if you find an individual that has NA for every answer in each column
def find_empty(row):
    for i in row:
        if not pd.isna(i):
            return False
    return True

In [135]:
# return True if an individual only answers Do not know but never Yes or No
def find_dont_know(row):
    temp = []
    for i in row:
        if not pd.isna(i):
            temp.append(i)
        
    if "Do not know" in temp and "Yes" not in temp and "No" not in temp:
        return True
    return False

In [136]:
# will return true if that row should be removed
# pass one pheno at a time through the function
def find_inconsistencies(row):
    for i in row:
        if not pd.isna(i) and i == "I am completely deaf":
            return True
    
    answer = "".join([str(hearing_ans[i]) for i in row if not pd.isna(i) and i != "I am completely deaf" and i != "Prefer not to answer"])
    if answer in inconsistent:
        return True
    return False

# if we have individuals that either don't answer or prefer not to say only then we cancel them out
# return true if all the rows have no definitive answers
def find_all_none(row):
    for i in row:
        if not pd.isna(i) and (i == "Yes" or i == "No"):
            return False
    return True

In [137]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,Not Sure
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,Not Sure
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,Not Sure
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,Not Sure
5,1000063,1000063,Male,1967,April,No,,,,No,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,Not Sure
455541,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,Not Sure
455542,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,Not Sure
455543,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,Not Sure


In [138]:
filtered['f.41281.0.0']

1               <NA>
2               <NA>
3               <NA>
4         1981-01-14
5               <NA>
             ...    
455540          <NA>
455541          <NA>
455542          <NA>
455543          <NA>
455544          <NA>
Name: f.41281.0.0, Length: 441194, dtype: string

In [139]:
# filter out inconsistencies for f3393
hearing_imp_qs = filtered[hearing_imp_f3393]
exclude = hearing_imp_qs.apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

In [140]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,Not Sure
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,Not Sure
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,Not Sure
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,Not Sure
5,1000063,1000063,Male,1967,April,No,,,,No,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,Not Sure
455541,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,Not Sure
455542,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,Not Sure
455543,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,Not Sure


In [141]:
# filter out inconsistencies for f2247
hearing_imp_qs = filtered[hearing_imp_f2247]
exclude = hearing_imp_qs.apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

In [142]:
# filter out inconsistencies for f2257
hearing_imp_qs = filtered[hearing_imp_f2257]
exclude = hearing_imp_qs.apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

In [143]:
# filter out individuals that don't have a definiteive answer for any hearing aid questions
hearing_imp_qs = filtered[hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257]
exclude = hearing_imp_qs.apply(find_all_none, axis=1)
filtered = filtered[~exclude]

In [144]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.250,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,Not Sure
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,Not Sure
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,Not Sure
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,Not Sure
5,1000063,1000063,Male,1967,April,No,,,,No,...,,,,,,,,,,Not Sure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,Not Sure
455541,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,Not Sure
455542,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,Not Sure
455543,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,Not Sure


In [145]:
saved_2_filtered = filtered

In [146]:
filtered = saved_2_filtered

## Identify Pure Controls

Need to make sure that for f.3393, f.2247, and f.2257 we obtain the individuals that say no.

The conditions for saying no are as follows:
* Never saying yes
* Not being inconsistent
* Saying no at least once (prefer not to say is allowed)

We are also not including individuals to be part of the control group if they have certain codes for ICD9, ICD10, or f.20002 (this is in the case that they say no to all f.3393, f.2247, and f.2257). However these individuals can still be part of the cases

In [147]:
# returns 0 if it's a ctrl or else 1
def find_ctrl(row):
    answer = "".join([str(hearing_ans[i]) for i in row if not pd.isna(i) and i != "Prefer not to answer"])
    if "0" in answer and "1" not in answer:
        return 0
    return 1

# returns 0 if it's a ctrl or else 1
# this is specific for f3393 because f3393 was only asked under certain circumstances
def find_ctrl_or_NA(row):
    for i in row:
        if not pd.isna(i) and i != "No" and i != "Prefer not to answer": # if we have any answers that are not NA or No only then we don't have a ctrl
            return 1
    return 0


In [148]:
# filter through the hearing impairment questions to find the controls
hearing_imp_qs = filtered[hearing_imp_f3393]
f3393_ctrl = hearing_imp_qs.apply(find_ctrl_or_NA, axis=1).to_list()
hearing_imp_qs = filtered[hearing_imp_f2247]
f2247_ctrl = hearing_imp_qs.apply(find_ctrl, axis=1).to_list()
hearing_imp_qs = filtered[hearing_imp_f2257]
f2257_ctrl = hearing_imp_qs.apply(find_ctrl, axis=1).to_list()

In [149]:
pure_ctrl = [0 if i == 0 and f2247_ctrl[en] == 0 and f2257_ctrl[en] == 0 else 1 for en, i in enumerate(f3393_ctrl)]

In [150]:
print(len(pure_ctrl) - sum(pure_ctrl), "individuals are controls prior to filtration for icd10, icd9 and self-reported codes")

236230 individuals are controls prior to filtration for icd10, icd9 and self-reported codes


### Collect ICD 10 codes to filter out from Ctrl

In [151]:
exclude_ctrl_icd10 = exclusion[(exclusion["UKBB_field_code"] == "f.41270") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd10

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,White_EU_After_PCA_clean,less than 55 years,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257
97,f.41270,H83.3 Noise effects on inner ear,24,,,N,Y,,,
98,f.41270,H83.8 Other specified diseases of inner ear,51,,,N,Y,,,
99,f.41270,"H83.9 Disease of inner ear, unspecified",33,,,N,Y,,,
105,f.41270,"H90.3 Sensorineural hearing loss, bilateral",721,679.0,289.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo
106,f.41270,"H90.4 Sensorineural hearing loss, unilateral w...",185,164.0,94.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo
107,f.41270,"H90.5 Sensorineural hearing loss, unspecified",880,813.0,336.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo
108,f.41270,H90.6 Mixed conductive and sensorineural heari...,133,118.0,60.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo
109,f.41270,H90.7 Mixed conductive and sensorineural heari...,75,67.0,34.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo
110,f.41270,H90.8 Mixed conductive and sensorineural heari...,115,106.0,48.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo
113,f.41270,H91.1 Presbycusis,408,,,N,Y,,,


In [152]:
ex_critia_ctrl_icd10 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd10["Phenotype"].tolist()]
ex_critia_ctrl_icd10

['H833',
 'H838',
 'H839',
 'H903',
 'H904',
 'H905',
 'H906',
 'H907',
 'H908',
 'H911',
 'H912',
 'H918',
 'H919',
 'H932',
 'H938',
 'H939',
 'Z461',
 'Z974']

In [153]:
icd10 = filtered[icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,...,f.41270.0.249,f.41270.0.250,f.41270.0.251,f.41270.0.252,f.41270.0.253,f.41270.0.254,f.41270.0.255,f.41270.0.256,f.41270.0.257,f.41270.0.258
1,F101,J342,R619,S8280,W010,,,,,,...,,,,,,,,,,
2,H269,K579,K590,K621,M5459,N40,R040,R31,R398,Z466,...,,,,,,,,,,
3,E780,G473,R065,R074,Z824,,,,,,...,,,,,,,,,,
4,C679,C787,C795,C798,D090,I802,I959,J181,K922,M169,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,A049,A099,A419,B370,C859,D649,E278,E279,E538,E559,...,,,,,,,,,,
455541,,,,,,,,,,,...,,,,,,,,,,
455542,O149,O266,O342,O471,O48,O610,O680,Z370,,,...,,,,,,,,,,
455543,G551,G558,I10,M501,Z981,,,,,,...,,,,,,,,,,


In [154]:
# collect the individuals who should not be part of controls because of icd 10 codes
ex_fxn_icd10 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd10)
ex_10 = icd10.apply(ex_fxn_icd10, axis=1)

### Collect ICD 9 codes to filter out from Ctrl

In [155]:
exclude_ctrl_icd9 = exclusion[(exclusion["UKBB_field_code"] == "f.41271") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd9

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,White_EU_After_PCA_clean,less than 55 years,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257
371,f.41271,3880 Degenerative and vascular disorders of ear,0,,,N,Y,,,
372,f.41271,3881 Noise effects on inner ear,0,,,N,Y,,,
373,f.41271,"3882 Sudden hearing loss, unspecified",0,,,N,Y,,,
374,f.41271,3883 Tinnitus,11,,,N,Y,,,
375,f.41271,3884 Other abnormal auditory perception,0,,,N,Y,,,
379,f.41271,3888 Other specified disorders of ear,1,,,N,Y,,,
380,f.41271,"3889 Disorders of ear, unspecified",2,,,N,Y,,,
383,f.41271,3891 Sensorineural deafness,6,,,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo
384,f.41271,3892 Mixed conductive and sensorineural deafness,1,,,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo
385,f.41271,"3897 Deaf mutism, not elsewhere classifiable",1,,,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo


In [156]:
ex_critia_ctrl_icd9 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd9["Phenotype"].tolist()]
ex_critia_ctrl_icd9

['3880',
 '3881',
 '3882',
 '3883',
 '3884',
 '3888',
 '3889',
 '3891',
 '3892',
 '3897',
 '3898',
 '3899',
 'V412',
 'V532']

In [157]:
icd9 = filtered[icd9_colnames]
icd9

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,...,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,3000,5198.0,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,,,,,,,,,,,...,,,,,,,,,,
455541,,,,,,,,,,,...,,,,,,,,,,
455542,,,,,,,,,,,...,,,,,,,,,,
455543,,,,,,,,,,,...,,,,,,,,,,


In [158]:
# collect the individuals who should not be part of controls because of icd 9 codes
ex_fxn_icd9 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd9)
ex_9 = icd9.apply(ex_fxn_icd9, axis=1)

### Collect f20002 codes to filter out from Ctrl

In [159]:
exclude_ctrl_f20002 = exclusion[(exclusion["UKBB_field_code"] == "f.20002") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_f20002

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,White_EU_After_PCA_clean,less than 55 years,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257
538,f.20002,1415 ear/vestibular disorder,4561,4228,3923,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo


In [160]:
f20002 = filtered[f20002_colnames]
f20002

Unnamed: 0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,...,f.20002.3.24,f.20002.3.25,f.20002.3.26,f.20002.3.27,f.20002.3.28,f.20002.3.29,f.20002.3.30,f.20002.3.31,f.20002.3.32,f.20002.3.33
1,1065.0,,,,,,,,,,...,,,,,,,,,,
2,1396.0,1473.0,,,,,,,,,...,,,,,,,,,,
3,1065.0,1294.0,1476.0,1473.0,1374.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1387.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,1464.0,,,,,,,,,,...,,,,,,,,,,
455541,1478.0,1473.0,,,,,,,,,...,,,,,,,,,,
455542,,,,,,,,,,,...,,,,,,,,,,
455543,1265.0,,,,,,,,,,...,,,,,,,,,,


In [161]:
ex_critia_ctrl_f20002 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_f20002["Phenotype"].tolist()]
ex_critia_ctrl_f20002

['1415']

In [162]:
# collect the individuals who should not be part of controls because of self-reported codes
ex_fxn_f20002 = lambda row: contains_exclusion(row, ex_critia_ctrl_f20002)
ex_f20002 = f20002.apply(ex_fxn_f20002, axis=1)

### Filter out the HI Ctrl

In [163]:
temp = pure_ctrl | ex_10 | ex_9 | ex_f20002

In [164]:
# because individuals that are controls are labeled as 0
# temp says True if an individual is not a control and False if it is a control
# ex_10, ex_9, and ex_f20002 are True for individuals that are not controls and False for individuals that are controls
temp = pure_ctrl | ex_10 | ex_9 | ex_f20002

# we set the control as 0 for each individual that is False in temp
filtered_ctrl = [1 if i else 0 for i in temp.to_list()]

In [165]:
print(len(filtered_ctrl) - sum(filtered_ctrl), "individuals are controls after addition filtration for icd10, icd9 and self-reported codes")

234172 individuals are controls after addition filtration for icd10, icd9 and self-reported codes


In [166]:
filtered["hearing_imp_pure_ctrl"] = filtered_ctrl

  filtered["hearing_imp_pure_ctrl"] = filtered_ctrl


In [167]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test,hearing_imp_pure_ctrl
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,Not Sure,1
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,Not Sure,1
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,Not Sure,1
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,Not Sure,1
5,1000063,1000063,Male,1967,April,No,,,,No,...,,,,,,,,,Not Sure,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455540,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,Not Sure,1
455541,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,Not Sure,0
455542,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,Not Sure,0
455543,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,Not Sure,0


In [168]:
filtered = filtered.reset_index()

  filtered = filtered.reset_index()


In [169]:
saved_3_filtered = filtered

In [170]:
filtered = saved_3_filtered

In [171]:
filtered

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,age_f.41280.0.251,age_f.41280.0.252,age_f.41280.0.253,age_f.41280.0.254,age_f.41280.0.255,age_f.41280.0.256,age_f.41280.0.257,age_f.41280.0.258,test,hearing_imp_pure_ctrl
0,1,1000022,1000022,Male,1954,August,Yes,,,,...,,,,,,,,,Not Sure,1
1,2,1000035,1000035,Male,1944,May,No,,,,...,,,,,,,,,Not Sure,1
2,3,1000046,1000046,Female,1946,March,No,,No,,...,,,,,,,,,Not Sure,1
3,4,1000054,1000054,Female,1942,January,No,,,,...,,,,,,,,,Not Sure,1
4,5,1000063,1000063,Male,1967,April,No,,,,...,,,,,,,,,Not Sure,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427004,455540,6025390,6025390,Female,1942,March,No,,,,...,,,,,,,,,Not Sure,1
427005,455541,6025409,6025409,Female,1946,November,No,No,,,...,,,,,,,,,Not Sure,0
427006,455542,6025411,6025411,Female,1960,November,No,,,,...,,,,,,,,,Not Sure,0
427007,455543,6025425,6025425,Female,1963,August,No,,,,...,,,,,,,,,Not Sure,0


## Identify All Age and Phenotype Columns

In [172]:
ages_f21003_col = [col for col in filtered if "f.21003" in col]
ages_f21003_col

['f.21003.0.0', 'f.21003.1.0', 'f.21003.2.0', 'f.21003.3.0']

In [173]:
ages_f131258_col = [col.strip('"') for col in filtered if 'f.131258' in col]
ages_f131258_col

['f.131258.0.0']

In [174]:
filtered[ages_f21003_col]

Unnamed: 0,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0
0,53,,,
1,63,,,
2,62,,73.0,
3,65,,,
4,43,,,
...,...,...,...,...
427004,67,,,
427005,61,66.0,72.0,
427006,49,,,
427007,44,,,


In [175]:
filtered[ages_f131258_col]

Unnamed: 0,f.131258.0.0
0,
1,
2,
3,
4,
...,...
427004,
427005,
427006,
427007,


In [176]:
# get the latest time that an individual said no to any of the phenotypes
# return the oldest age that they were
def get_ctrl_age(row):
    phens = [hearing_imp_f3393, hearing_imp_f2247, hearing_imp_f2257]
    ages = []
    if row["hearing_imp_pure_ctrl"] == 0:
        temp_all_ages = row[ages_f21003_col].to_list()
        temp_all_ages.reverse()
        
        for phen in phens:
            temp = row[phen].to_list()
            temp.reverse()
            for en, i in enumerate(temp):
                if not pd.isna(i) and i == "No":
                    ages.append(temp_all_ages[en])
                    break
        ages.sort()
        return ages[-1]
    return pd.NA

# get the earliest time that an individual said yes to having a phenotype
def get_phen_age(row):
    if row[0] == 1:
        temp = row[1:-4].to_list()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "Yes":
                return row[ages_f21003_col][en]
    else:
        return pd.NA
    
# return the minium age in the row, or else return NA
def get_min_age(row):
    temp = [i for i in row.to_list() if not pd.isna(i)]
    if len(temp) > 0:
        return min(temp)
    else:
        return pd.NA
    

In [177]:
# return 1 if we have a yes (used to find phenos)
def find_yes(row):
    for i in row:
        if not pd.isna(i) and i == "Yes":
            return 1
    return 0

# return 1 if we have a match for the mendelian traits and have at least one of the hearing phenotypes
def find_medelian_like(row):
    mendelian_icd10 = ["H903", "H905", "H906", "H908", "H913", "H918", "H919"]
    mendelian_icd9 = ["3891", "3892", "3897", "3898", "3899"]

    if 1 in row[["f3393", "f2247", "f2257"]].to_list():
        for en, i in enumerate(row[icd_10_cols]):
            if not pd.isna(i) and i in mendelian_icd10:
                return 1, get_ages_from_birth(row[icd10_ages[en:en+1]+year_of_birth+month_of_birth])
        for en, i in enumerate(row[icd_9_cols]):
            if not pd.isna(i) and i in mendelian_icd9:
                return 1, get_ages_from_birth(row[icd9_ages[en:en+1]+year_of_birth+month_of_birth])
    return 0, pd.NA

# return a 0 if the individual is not a case and 1 if they are a case
def find_exclusions(row):
    mendelian_icd10 = ["H903", "H904", "H905", "H906", "H907", "H908"]
    mendelian_icd9 = ["3891", "3892", "3897", "3898", "3899"]

    try:
        if 1 == row[0]: # the first column will be one of the phenotypes, "f3393", "f2247", or "f2257"
            for en, i in enumerate(row[icd_10_cols]):
                if not pd.isna(i) and i in mendelian_icd10:
                    if get_ages_from_birth(row[ages_f131258_col+year_of_birth+month_of_birth]) <= 55: # row[1] must be the age of the phenotype
                        return 0, pd.NA
                    return 1, get_ages_from_birth(row[ages_f131258_col+year_of_birth+month_of_birth])
            for en, i in enumerate(row[icd_9_cols]):
                if not pd.isna(i) and i in mendelian_icd9:
                    if get_ages_from_birth(row[icd9_ages[en:en+1]+year_of_birth+month_of_birth]) <= 55:
                        return 0, pd.NA
                    return 1, get_ages_from_birth(row[icd9_ages[en:en+1]+year_of_birth+month_of_birth])
        return int(row[0]), row[1] # the second column should be the ages of that phenotype
    except:
        print(row["name"])


# return 1 if we have a match for the other cases of f3393 or originally had f3393
def find_f3393_other_cases(row):
    icd10 = ["Z461", "Z974"]
    icd9 = ["V412", "V532"]
    if 0 == int(row["f3393"]):
        for en, i in enumerate(row[icd_10_cols]):
            if not pd.isna(i) and i in icd10:
                return 1, get_ages_from_birth(row[icd10_ages[en:en+1]+year_of_birth+month_of_birth])
        for en, i in enumerate(row[icd_9_cols]):
            if not pd.isna(i) and i in icd9:
                return 1, get_ages_from_birth(row[icd9_ages[en:en+1]+year_of_birth+month_of_birth])
    return int(row["f3393"]), row["f3393_age"]

# check if the h919 code exists in the individual
def check_code(row):
    for i in row:
        if not pd.isna(i) and i == "H919":
            return 1
    return 0

In [178]:
def get_ages_from_birth(row):
    month_dict = {'January':1,'February':2,'March':3,'April':4,'May':5,'June':6,'July':7,'August':8,'September':9,'October':10,'November':11,'December':12}
    year = 0
    month = 0
    i = row[0]
    if not pd.isna(i):
        i = i.split("-")
        year = i[0]
        month = i[1]
    if int(month) >= month_dict[row[month_of_birth[0]]]:
        return int(year) - int(row[year_of_birth[0]])
    return int(year) - int(row[year_of_birth[0]]) - 1

In [179]:
# f3393
hearing_imp_qs = filtered[hearing_imp_f3393]
filtered["f3393"] = hearing_imp_qs.apply(find_yes, axis=1)
filtered["f3393_age"] = filtered[["f3393"] + hearing_imp_f3393 + ages_f21003_col].apply(get_phen_age, axis=1)
filtered[["f3393", "f3393_age"]] = filtered[["f3393", "f3393_age"] + icd_10_cols + icd_9_cols + icd10_ages + icd9_ages + ages_f131258_col + year_of_birth + month_of_birth].apply(find_exclusions, axis=1, result_type='expand')
filtered[["f3393", "f3393_age"]] = filtered[["f3393", "f3393_age"] + icd_10_cols + icd_9_cols + icd10_ages + icd9_ages + year_of_birth + month_of_birth].apply(find_f3393_other_cases, axis=1, result_type='expand')

  filtered["f3393"] = hearing_imp_qs.apply(find_yes, axis=1)
  filtered["f3393_age"] = filtered[["f3393"] + hearing_imp_f3393 + ages_f21003_col].apply(get_phen_age, axis=1)


In [180]:
# check if we have f3393 ages
filtered[(filtered["f3393"] == 1) & (pd.isna(filtered["f3393_age"])) & (pd.isna(filtered[ages_f131258_col[0]]))][["f3393", "f3393_age"] + ages_f131258_col]

Unnamed: 0,f3393,f3393_age,f.131258.0.0


In [181]:
# f2247
hearing_imp_qs = filtered[hearing_imp_f2247]
filtered["f2247"] = hearing_imp_qs.apply(find_yes, axis=1)
filtered["f2247_age"] = filtered[["f2247"] + hearing_imp_f2247 + ages_f21003_col].apply(get_phen_age, axis=1)
filtered[["f2247", "f2247_age"]] = filtered[["f2247", "f2247_age"] + icd_10_cols + icd_9_cols + icd10_ages + icd9_ages + ages_f131258_col + year_of_birth + month_of_birth].apply(find_exclusions, axis=1, result_type='expand')

  filtered["f2247"] = hearing_imp_qs.apply(find_yes, axis=1)
  filtered["f2247_age"] = filtered[["f2247"] + hearing_imp_f2247 + ages_f21003_col].apply(get_phen_age, axis=1)


In [182]:
# check if we have f2247 ages
filtered[(filtered["f2247"] == 1) & (pd.isna(filtered["f2247_age"]))][["f2247", "f2247_age"]]

Unnamed: 0,f2247,f2247_age


In [183]:
# f2257
hearing_imp_qs = filtered[hearing_imp_f2257]
filtered["f2257"] = hearing_imp_qs.apply(find_yes, axis=1)
filtered["f2257_age"] = filtered[["f2257"] + hearing_imp_f2257 + ages_f21003_col].apply(get_phen_age, axis=1)
filtered[["f2257", "f2257_age"]] = filtered[["f2257", "f2257_age"] + icd_10_cols + icd_9_cols + icd10_ages + icd9_ages + ages_f131258_col + year_of_birth + month_of_birth].apply(find_exclusions, axis=1, result_type='expand')

  filtered["f2257"] = hearing_imp_qs.apply(find_yes, axis=1)
  filtered["f2257_age"] = filtered[["f2257"] + hearing_imp_f2257 + ages_f21003_col].apply(get_phen_age, axis=1)


In [184]:
# check if we have empty f2257 ages
filtered[(filtered["f2257"] == 1) & (pd.isna(filtered["f2257_age"]))][["f2257", "f2257_age"]]

Unnamed: 0,f2257,f2257_age


In [185]:
# ctrl age
filtered["ctrl_age"] = filtered[["hearing_imp_pure_ctrl"] + ages_f21003_col +  hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257].apply(get_ctrl_age, axis=1)

  filtered["ctrl_age"] = filtered[["hearing_imp_pure_ctrl"] + ages_f21003_col +  hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257].apply(get_ctrl_age, axis=1)


In [186]:
# check if we have empty ctrl ages
filtered[(filtered["hearing_imp_pure_ctrl"] == 0) & (pd.isna(filtered["ctrl_age"]))][["hearing_imp_pure_ctrl", "ctrl_age"]]

Unnamed: 0,hearing_imp_pure_ctrl,ctrl_age


In [187]:
# f2247_f2257
filtered["f2247_f2257"] =  filtered["f2247"] + filtered["f2257"]
filtered["f2247_f2257_age"] = filtered[["f2247_age", "f2257_age"]].apply(get_min_age, axis=1)
filtered["f2247_f2257"] 

  filtered["f2247_f2257"] =  filtered["f2247"] + filtered["f2257"]
  filtered["f2247_f2257_age"] = filtered[["f2247_age", "f2257_age"]].apply(get_min_age, axis=1)


0         2
1         1
2         1
3         1
4         0
         ..
427004    1
427005    0
427006    0
427007    0
427008    0
Name: f2247_f2257, Length: 427009, dtype: int64

In [188]:
# check if we have empty f2247_f2257 ages
filtered[(filtered["f2247_f2257"] == 2) & (pd.isna(filtered["f2247_f2257_age"]))][["f2247_f2257", "f2247_f2257_age"]]

Unnamed: 0,f2247_f2257,f2247_f2257_age


In [189]:
mask = filtered["f2247_f2257"] == 2
Both = filtered[mask]
Both

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,hearing_imp_pure_ctrl,f3393,f3393_age,f2247,f2247_age,f2257,f2257_age,ctrl_age,f2247_f2257,f2247_f2257_age
0,1,1000022,1000022,Male,1954,August,Yes,,,,...,1,0,,1,53,1,53,,2,53
9,10,1000112,1000112,Male,1949,June,Do not know,,Yes,,...,1,1,68.0,1,68.0,1,68.0,,2,68.0
14,15,1000170,1000170,Female,1956,April,Yes,,,,...,1,0,,1,51,1,51,,2,51
16,17,1000198,1000198,Female,1967,July,Yes,,,,...,1,0,,1,41,1,41,,2,41
17,18,1000203,1000203,Female,1959,September,Yes,,,,...,1,0,,1,49,1,49,,2,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
426972,455506,6024983,6024983,Male,1941,May,Yes,,,,...,1,0,,1,67,1,67,,2,67
426973,455507,6024997,6024997,Female,1950,April,Yes,,,,...,1,0,,1,58,1,58,,2,58
426983,455517,6025128,6025128,Female,1944,November,Yes,,,,...,1,0,,1,64,1,64,,2,64
426984,455518,6025134,6025134,Male,1944,May,Yes,,,,...,1,1,64,1,64,1,64,,2,64


## 5.4. File Output

In [190]:
filtered

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,hearing_imp_pure_ctrl,f3393,f3393_age,f2247,f2247_age,f2257,f2257_age,ctrl_age,f2247_f2257,f2247_f2257_age
0,1,1000022,1000022,Male,1954,August,Yes,,,,...,1,0,,1,53,1,53,,2,53
1,2,1000035,1000035,Male,1944,May,No,,,,...,1,0,,0,,1,63,,1,63
2,3,1000046,1000046,Female,1946,March,No,,No,,...,1,0,,0,,1,73.0,,1,73.0
3,4,1000054,1000054,Female,1942,January,No,,,,...,1,0,,0,,1,65,,1,65
4,5,1000063,1000063,Male,1967,April,No,,,,...,0,0,,0,,0,,43,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427004,455540,6025390,6025390,Female,1942,March,No,,,,...,1,0,,0,,1,67,,1,67
427005,455541,6025409,6025409,Female,1946,November,No,No,,,...,0,0,,0,,0,,66.0,0,
427006,455542,6025411,6025411,Female,1960,November,No,,,,...,0,0,,0,,0,,49,0,
427007,455543,6025425,6025425,Female,1963,August,No,,,,...,0,0,,0,,0,,44,0,


In [192]:
filtered[filtered["hearing_imp_pure_ctrl"] == 0][["FID", "IID", "f.22001.0.0", "hearing_imp_pure_ctrl", "ctrl_age"]]

Unnamed: 0,FID,IID,f.22001.0.0,hearing_imp_pure_ctrl,ctrl_age
4,1000063,1000063,Male,0,43
5,1000078,1000078,Female,0,60.0
6,1000081,1000081,Male,0,67
10,1000129,1000129,Male,0,62
11,1000137,1000137,Female,0,46
...,...,...,...,...,...
427002,6025363,6025363,Male,0,64
427005,6025409,6025409,Female,0,66.0
427006,6025411,6025411,Female,0,49
427007,6025425,6025425,Female,0,44


In [193]:
filtered[filtered["hearing_imp_pure_ctrl"] == 0][["FID", "IID", "f.22001.0.0", "hearing_imp_pure_ctrl", "ctrl_age"]].to_csv("pure_ctrl_pheno_file_DMCS.tsv", sep='\t', index=False)

In [194]:
ctl = filtered[filtered["hearing_imp_pure_ctrl"] == 0][["FID", "IID", "f.22001.0.0", "hearing_imp_pure_ctrl", "ctrl_age"]]
ctl

Unnamed: 0,FID,IID,f.22001.0.0,hearing_imp_pure_ctrl,ctrl_age
4,1000063,1000063,Male,0,43
5,1000078,1000078,Female,0,60.0
6,1000081,1000081,Male,0,67
10,1000129,1000129,Male,0,62
11,1000137,1000137,Female,0,46
...,...,...,...,...,...
427002,6025363,6025363,Male,0,64
427005,6025409,6025409,Female,0,66.0
427006,6025411,6025411,Female,0,49
427007,6025425,6025425,Female,0,44


In [195]:
ctl['sex'] = 1 
ctl.loc[(ctl['f.22001.0.0'] == 'Female') , 'sex'] = 2
ctl

Unnamed: 0,FID,IID,f.22001.0.0,hearing_imp_pure_ctrl,ctrl_age,sex
4,1000063,1000063,Male,0,43,1
5,1000078,1000078,Female,0,60.0,2
6,1000081,1000081,Male,0,67,1
10,1000129,1000129,Male,0,62,1
11,1000137,1000137,Female,0,46,2
...,...,...,...,...,...,...
427002,6025363,6025363,Male,0,64,1
427005,6025409,6025409,Female,0,66.0,2
427006,6025411,6025411,Female,0,49,2
427007,6025425,6025425,Female,0,44,2


In [196]:
ctl_2 = ctl[["FID", "IID", "hearing_imp_pure_ctrl", "ctrl_age", "sex"]]
ctl_2

Unnamed: 0,FID,IID,hearing_imp_pure_ctrl,ctrl_age,sex
4,1000063,1000063,0,43,1
5,1000078,1000078,0,60.0,2
6,1000081,1000081,0,67,1
10,1000129,1000129,0,62,1
11,1000137,1000137,0,46,2
...,...,...,...,...,...
427002,6025363,6025363,0,64,1
427005,6025409,6025409,0,66.0,2
427006,6025411,6025411,0,49,2
427007,6025425,6025425,0,44,2


In [197]:
ctl_2.to_csv("pure_ctrl_pheno_file_DMCS.tsv", sep='\t', index=False)

In [198]:
ctl_2.isnull().sum()

FID                      0
IID                      0
hearing_imp_pure_ctrl    0
ctrl_age                 0
sex                      0
dtype: int64

In [199]:
ctl_2['ctrl_age'].min()

'38'

In [200]:
ctl_2['ctrl_age'].value_counts(dropna=False)

61      10746
62      10156
60       9892
63       9284
64       9020
        ...  
39          3
43.0        3
38          1
72          1
71          1
Name: ctrl_age, Length: 77, dtype: int64

In [201]:
filtered[filtered["f3393"] == 1][["FID", "IID", "f.22001.0.0", "f3393", "f3393_age"]]

Unnamed: 0,FID,IID,f.22001.0.0,f3393,f3393_age
9,1000112,1000112,Male,1,68.0
43,1000551,1000551,Male,1,83.0
84,1001067,1001067,Male,1,50
114,1001384,1001384,Female,1,61
120,1001459,1001459,Male,1,64
...,...,...,...,...,...
426726,6022050,6022050,Female,1,52
426772,6022567,6022567,Male,1,73.0
426824,6023186,6023186,Male,1,67
426857,6023568,6023568,Female,1,68


In [202]:
filtered[filtered["f2247"] == 1][["FID", "IID", "f.22001.0.0", "f2247", "f2247_age"]]

Unnamed: 0,FID,IID,f.22001.0.0,f2247,f2247_age
0,1000022,1000022,Male,1,53
7,1000090,1000090,Female,1,64
9,1000112,1000112,Male,1,68.0
14,1000170,1000170,Female,1,51
16,1000198,1000198,Female,1,41
...,...,...,...,...,...
426972,6024983,6024983,Male,1,67
426973,6024997,6024997,Female,1,58
426983,6025128,6025128,Female,1,64
426984,6025134,6025134,Male,1,64


In [203]:
filtered[filtered["f2257"] == 1][["FID", "IID", "f.22001.0.0", "f2257", "f2257_age"]]

Unnamed: 0,FID,IID,f.22001.0.0,f2257,f2257_age
0,1000022,1000022,Male,1,53
1,1000035,1000035,Male,1,63
2,1000046,1000046,Female,1,73.0
3,1000054,1000054,Female,1,65
8,1000105,1000105,Female,1,54
...,...,...,...,...,...
426998,6025322,6025322,Female,1,63
427000,6025346,6025346,Female,1,68.0
427001,6025354,6025354,Female,1,52
427003,6025378,6025378,Male,1,42


In [204]:
filtered[filtered["f2247_f2257"] == 2][["FID", "IID", "f.22001.0.0", "f2247_f2257", "f2247_f2257_age"]]

Unnamed: 0,FID,IID,f.22001.0.0,f2247_f2257,f2247_f2257_age
0,1000022,1000022,Male,2,53
9,1000112,1000112,Male,2,68.0
14,1000170,1000170,Female,2,51
16,1000198,1000198,Female,2,41
17,1000203,1000203,Female,2,49
...,...,...,...,...,...
426972,6024983,6024983,Male,2,67
426973,6024997,6024997,Female,2,58
426983,6025128,6025128,Female,2,64
426984,6025134,6025134,Male,2,64


In [None]:
## remove IDs from cases who used medication based on Francescia report

In [205]:
medication = pd.read_csv("/mnt/vast/hpc/csg/en2509/Phenotype/IDs_used_medication_removed_ARHL.csv", quotechar = '"', dtype="int")
medication

Unnamed: 0,IID
0,1003108
1,1004336
2,1007149
3,1007824
4,1008132
...,...
4035,5799083
4036,5800395
4037,5948042
4038,2517668


In [206]:
Both = filtered[filtered["f2247_f2257"] == 2][["FID", "IID", "f.22001.0.0", "f2247_f2257", "f2247_f2257_age"]]
Both_clean = Both[~Both.loc[:,'IID'].isin(medication['IID'])]
Both_clean

Unnamed: 0,FID,IID,f.22001.0.0,f2247_f2257,f2247_f2257_age
0,1000022,1000022,Male,2,53
9,1000112,1000112,Male,2,68.0
14,1000170,1000170,Female,2,51
16,1000198,1000198,Female,2,41
17,1000203,1000203,Female,2,49
...,...,...,...,...,...
426972,6024983,6024983,Male,2,67
426973,6024997,6024997,Female,2,58
426983,6025128,6025128,Female,2,64
426984,6025134,6025134,Male,2,64


In [207]:
Both_clean['sex'] = 1 
Both_clean.loc[(Both_clean['f.22001.0.0'] == 'Female') , 'sex'] = 2
Both_clean

Unnamed: 0,FID,IID,f.22001.0.0,f2247_f2257,f2247_f2257_age,sex
0,1000022,1000022,Male,2,53,1
9,1000112,1000112,Male,2,68.0,1
14,1000170,1000170,Female,2,51,2
16,1000198,1000198,Female,2,41,2
17,1000203,1000203,Female,2,49,2
...,...,...,...,...,...,...
426972,6024983,6024983,Male,2,67,1
426973,6024997,6024997,Female,2,58,2
426983,6025128,6025128,Female,2,64,2
426984,6025134,6025134,Male,2,64,1


In [208]:
Both_clean['both'] = 1 
Both_clean

Unnamed: 0,FID,IID,f.22001.0.0,f2247_f2257,f2247_f2257_age,sex,both
0,1000022,1000022,Male,2,53,1,1
9,1000112,1000112,Male,2,68.0,1,1
14,1000170,1000170,Female,2,51,2,1
16,1000198,1000198,Female,2,41,2,1
17,1000203,1000203,Female,2,49,2,1
...,...,...,...,...,...,...,...
426972,6024983,6024983,Male,2,67,1,1
426973,6024997,6024997,Female,2,58,2,1
426983,6025128,6025128,Female,2,64,2,1
426984,6025134,6025134,Male,2,64,1,1


In [210]:
aid = filtered[filtered["f3393"] == 1][["FID", "IID", "f.22001.0.0", "f3393", "f3393_age"]]
aid_clean = aid[~aid.loc[:,'IID'].isin(medication['IID'])]
aid_clean

Unnamed: 0,FID,IID,f.22001.0.0,f3393,f3393_age
9,1000112,1000112,Male,1,68.0
43,1000551,1000551,Male,1,83.0
84,1001067,1001067,Male,1,50
114,1001384,1001384,Female,1,61
120,1001459,1001459,Male,1,64
...,...,...,...,...,...
426726,6022050,6022050,Female,1,52
426772,6022567,6022567,Male,1,73.0
426824,6023186,6023186,Male,1,67
426857,6023568,6023568,Female,1,68


In [211]:
aid_clean['sex'] = 1 
aid_clean.loc[(aid_clean['f.22001.0.0'] == 'Female') , 'sex'] = 2
aid_clean

Unnamed: 0,FID,IID,f.22001.0.0,f3393,f3393_age,sex
9,1000112,1000112,Male,1,68.0,1
43,1000551,1000551,Male,1,83.0,1
84,1001067,1001067,Male,1,50,1
114,1001384,1001384,Female,1,61,2
120,1001459,1001459,Male,1,64,1
...,...,...,...,...,...,...
426726,6022050,6022050,Female,1,52,2
426772,6022567,6022567,Male,1,73.0,1
426824,6023186,6023186,Male,1,67,1
426857,6023568,6023568,Female,1,68,2


In [212]:
diff = filtered[filtered["f2247"] == 1][["FID", "IID", "f.22001.0.0", "f2247", "f2247_age"]]
diff_clean = diff[~diff.loc[:,'IID'].isin(medication['IID'])]
diff_clean

Unnamed: 0,FID,IID,f.22001.0.0,f2247,f2247_age
0,1000022,1000022,Male,1,53
7,1000090,1000090,Female,1,64
9,1000112,1000112,Male,1,68.0
14,1000170,1000170,Female,1,51
16,1000198,1000198,Female,1,41
...,...,...,...,...,...
426972,6024983,6024983,Male,1,67
426973,6024997,6024997,Female,1,58
426983,6025128,6025128,Female,1,64
426984,6025134,6025134,Male,1,64


In [220]:
diff_clean['sex'] = 1 
diff_clean.loc[(diff_clean['f.22001.0.0'] == 'Female') , 'sex'] = 2
diff_clean

Unnamed: 0,FID,IID,f.22001.0.0,f2247,f2247_age,sex
0,1000022,1000022,Male,1,53,1
7,1000090,1000090,Female,1,64,2
9,1000112,1000112,Male,1,68.0,1
14,1000170,1000170,Female,1,51,2
16,1000198,1000198,Female,1,41,2
...,...,...,...,...,...,...
426972,6024983,6024983,Male,1,67,1
426973,6024997,6024997,Female,1,58,2
426983,6025128,6025128,Female,1,64,2
426984,6025134,6025134,Male,1,64,1


In [213]:
noise = filtered[filtered["f2257"] == 1][["FID", "IID", "f.22001.0.0", "f2257", "f2257_age"]]
noise_clean = noise[~noise.loc[:,'IID'].isin(medication['IID'])]
noise_clean

Unnamed: 0,FID,IID,f.22001.0.0,f2257,f2257_age
0,1000022,1000022,Male,1,53
1,1000035,1000035,Male,1,63
2,1000046,1000046,Female,1,73.0
3,1000054,1000054,Female,1,65
8,1000105,1000105,Female,1,54
...,...,...,...,...,...
426998,6025322,6025322,Female,1,63
427000,6025346,6025346,Female,1,68.0
427001,6025354,6025354,Female,1,52
427003,6025378,6025378,Male,1,42


In [214]:
noise_clean['sex'] = 1 
noise_clean.loc[(noise_clean['f.22001.0.0'] == 'Female') , 'sex'] = 2
noise_clean

Unnamed: 0,FID,IID,f.22001.0.0,f2257,f2257_age,sex
0,1000022,1000022,Male,1,53,1
1,1000035,1000035,Male,1,63,1
2,1000046,1000046,Female,1,73.0,2
3,1000054,1000054,Female,1,65,2
8,1000105,1000105,Female,1,54,2
...,...,...,...,...,...,...
426998,6025322,6025322,Female,1,63,2
427000,6025346,6025346,Female,1,68.0,2
427001,6025354,6025354,Female,1,52,2
427003,6025378,6025378,Male,1,42,1


In [215]:
Both_clean_2 = Both_clean[["FID", "IID", "both", "f2247_f2257_age", "sex"]]
Both_clean_2

Unnamed: 0,FID,IID,both,f2247_f2257_age,sex
0,1000022,1000022,1,53,1
9,1000112,1000112,1,68.0,1
14,1000170,1000170,1,51,2
16,1000198,1000198,1,41,2
17,1000203,1000203,1,49,2
...,...,...,...,...,...
426972,6024983,6024983,1,67,1
426973,6024997,6024997,1,58,2
426983,6025128,6025128,1,64,2
426984,6025134,6025134,1,64,1


In [216]:
Both_clean_2.to_csv("f2247_f2257_pheno_file_DMCS.tsv", sep='\t', index=False)

In [217]:
aid_clean_2 = aid_clean [["FID", "IID", "f3393", "f3393_age", "sex"]]
aid_clean_2

Unnamed: 0,FID,IID,f3393,f3393_age,sex
9,1000112,1000112,1,68.0,1
43,1000551,1000551,1,83.0,1
84,1001067,1001067,1,50,1
114,1001384,1001384,1,61,2
120,1001459,1001459,1,64,1
...,...,...,...,...,...
426726,6022050,6022050,1,52,2
426772,6022567,6022567,1,73.0,1
426824,6023186,6023186,1,67,1
426857,6023568,6023568,1,68,2


In [218]:
aid_clean_2.to_csv("f3393_pheno_file_DMCS.tsv", sep='\t', index=False)

In [221]:
diff_clean_2 = diff_clean [["FID", "IID", "f2247", "f2247_age", "sex"]]
diff_clean_2

Unnamed: 0,FID,IID,f2247,f2247_age,sex
0,1000022,1000022,1,53,1
7,1000090,1000090,1,64,2
9,1000112,1000112,1,68.0,1
14,1000170,1000170,1,51,2
16,1000198,1000198,1,41,2
...,...,...,...,...,...
426972,6024983,6024983,1,67,1
426973,6024997,6024997,1,58,2
426983,6025128,6025128,1,64,2
426984,6025134,6025134,1,64,1


In [222]:
diff_clean_2.to_csv("f2247_pheno_file_DMCS.tsv", sep='\t', index=False)

In [223]:
noise_clean_2 = noise_clean [["FID", "IID", "f2257", "f2257_age", "sex"]]
noise_clean_2

Unnamed: 0,FID,IID,f2257,f2257_age,sex
0,1000022,1000022,1,53,1
1,1000035,1000035,1,63,1
2,1000046,1000046,1,73.0,2
3,1000054,1000054,1,65,2
8,1000105,1000105,1,54,2
...,...,...,...,...,...
426998,6025322,6025322,1,63,2
427000,6025346,6025346,1,68.0,2
427001,6025354,6025354,1,52,2
427003,6025378,6025378,1,42,1


In [224]:
noise_clean_2.to_csv("f2257_pheno_file_DMCS.tsv", sep='\t', index=False)

In [147]:
#diff.isnull().sum()

In [148]:
#diff['f2247_f2257_age'].min()

In [149]:
#diff['f2247_age'].value_counts(dropna=False)

## Combine pheno files and control

In [225]:
ctrl_file_name = "pure_ctrl_pheno_file_DMCS.tsv"
f3393_file_name = "f3393_pheno_file_DMCS.tsv"
f2247_file_name = "f2247_pheno_file_DMCS.tsv"
f2257_file_name = "f2257_pheno_file_DMCS.tsv"
f2247_f2257_file_name = "f2247_f2257_pheno_file_DMCS.tsv"

In [226]:
f3393 = pd.read_csv(f3393_file_name, sep="\t")
f2247 = pd.read_csv(f2247_file_name, sep="\t")
f2257 = pd.read_csv(f2257_file_name, sep="\t")
f2247_f2257 = pd.read_csv(f2247_f2257_file_name, sep="\t")

In [245]:
ctrl = pd.read_csv(ctrl_file_name, sep="\t")

In [227]:
print("ctrl: ",len(ctrl))
print("f2247: ",len(f2247))
print("f2257: ",len(f2257))
print("f2247_f2257: ",len(f2247_f2257))
print("f3393: ",len(f3393))

ctrl:  234172
f2247:  112495
f2257:  162855
f2247_f2257:  95380
f3393:  17389


### Final file f2257

In [228]:
f2257

Unnamed: 0,FID,IID,f2257,f2257_age,sex
0,1000022,1000022,1,53.0,1
1,1000035,1000035,1,63.0,1
2,1000046,1000046,1,73.0,2
3,1000054,1000054,1,65.0,2
4,1000105,1000105,1,54.0,2
...,...,...,...,...,...
162850,6025322,6025322,1,63.0,2
162851,6025346,6025346,1,68.0,2
162852,6025354,6025354,1,52.0,2
162853,6025378,6025378,1,42.0,1


In [229]:
ctrl

Unnamed: 0,FID,IID,hearing_imp_pure_ctrl,ctrl_age,sex
0,1000063,1000063,0,43.0,1
1,1000078,1000078,0,60.0,2
2,1000081,1000081,0,67.0,1
3,1000129,1000129,0,62.0,1
4,1000137,1000137,0,46.0,2
...,...,...,...,...,...
234167,6025363,6025363,0,64.0,1
234168,6025409,6025409,0,66.0,2
234169,6025411,6025411,0,49.0,2
234170,6025425,6025425,0,44.0,2


In [246]:
ctrl_2257 = ctrl.rename(columns={"hearing_imp_pure_ctrl":"f2257", "ctrl_age":"age"})
ctrl_2257

Unnamed: 0,FID,IID,f2257,age,sex
0,1000063,1000063,0,43.0,1
1,1000078,1000078,0,60.0,2
2,1000081,1000081,0,67.0,1
3,1000129,1000129,0,62.0,1
4,1000137,1000137,0,46.0,2
...,...,...,...,...,...
234167,6025363,6025363,0,64.0,1
234168,6025409,6025409,0,66.0,2
234169,6025411,6025411,0,49.0,2
234170,6025425,6025425,0,44.0,2


In [231]:
f2257 = f2257.rename(columns={"f2257_age":"age"})
f2257

Unnamed: 0,FID,IID,f2257,age,sex
0,1000022,1000022,1,53.0,1
1,1000035,1000035,1,63.0,1
2,1000046,1000046,1,73.0,2
3,1000054,1000054,1,65.0,2
4,1000105,1000105,1,54.0,2
...,...,...,...,...,...
162850,6025322,6025322,1,63.0,2
162851,6025346,6025346,1,68.0,2
162852,6025354,6025354,1,52.0,2
162853,6025378,6025378,1,42.0,1


In [255]:
full_pheno_2257 = f2257.append(ctrl_2257)
full_pheno_2257

Unnamed: 0,FID,IID,f2257,age,sex
0,1000022,1000022,1,53.0,1
1,1000035,1000035,1,63.0,1
2,1000046,1000046,1,73.0,2
3,1000054,1000054,1,65.0,2
4,1000105,1000105,1,54.0,2
...,...,...,...,...,...
234167,6025363,6025363,0,64.0,1
234168,6025409,6025409,0,66.0,2
234169,6025411,6025411,0,49.0,2
234170,6025425,6025425,0,44.0,2


In [240]:
## reading array info
array = pd.read_csv("/mnt/vast/hpc/csg/en2509/X_chr/UKBB_Genotype_array", sep='\t')
array

Unnamed: 0,FID,IID,array
0,1000019,1000019,1
1,1000022,1000022,1
2,1000035,1000035,1
3,1000046,1000046,1
4,1000054,1000054,2
...,...,...,...
486411,6025390,6025390,1
486412,6025409,6025409,1
486413,6025411,6025411,2
486414,6025425,6025425,1


In [256]:
merged_2257 = pd.merge(left=full_pheno, right=array, left_on=['IID','FID'], right_on=['IID','FID'])
merged_2257

Unnamed: 0,FID,IID,f2257,age,sex,array
0,1000022,1000022,1,53.0,1,1
1,1000035,1000035,1,63.0,1,1
2,1000046,1000046,1,73.0,2,1
3,1000054,1000054,1,65.0,2,2
4,1000105,1000105,1,54.0,2,1
...,...,...,...,...,...,...
397022,6025363,6025363,0,64.0,1,1
397023,6025409,6025409,0,66.0,2,1
397024,6025411,6025411,0,49.0,2,2
397025,6025425,6025425,0,44.0,2,1


In [257]:
merged_2257[["FID", "IID","f2257", "age", "sex", "array"]].to_csv("072423_UKBB_Hnoise_f2257_expandedwhite_DMCS_397027ind.tsv", sep='\t', index=False)

In [303]:
# Create the phenofile for PCA
merged_2257['ethnicity']='white_european'
merged_2257
merged_2257[["FID", "IID", "ethnicity"]].to_csv("072423_UKBB_Hnoise_f2257_expandedwhite_DMCS_397027ind.phenopca", sep='\t', index=False)

In [None]:
merged_2257[["FID", "IID","f2257", "age", "sex", "array"]].to_csv("072423_UKBB_Hnoise_f2257_expandedwhite_DMCS_397027ind.tsv", sep='\t', index=False)

### Final file f3393

In [247]:
ctrl_3393 = ctrl.rename(columns={"hearing_imp_pure_ctrl":"f3393", "ctrl_age":"age"})
ctrl_3393

Unnamed: 0,FID,IID,f3393,age,sex
0,1000063,1000063,0,43.0,1
1,1000078,1000078,0,60.0,2
2,1000081,1000081,0,67.0,1
3,1000129,1000129,0,62.0,1
4,1000137,1000137,0,46.0,2
...,...,...,...,...,...
234167,6025363,6025363,0,64.0,1
234168,6025409,6025409,0,66.0,2
234169,6025411,6025411,0,49.0,2
234170,6025425,6025425,0,44.0,2


In [251]:
f3393= f3393.rename(columns={"f3393_age":"age"})
f3393

Unnamed: 0,FID,IID,f3393,age,sex
0,1000112,1000112,1,68.0,1
1,1000551,1000551,1,83.0,1
2,1001067,1001067,1,50.0,1
3,1001384,1001384,1,61.0,2
4,1001459,1001459,1,64.0,1
...,...,...,...,...,...
17384,6022050,6022050,1,52.0,2
17385,6022567,6022567,1,73.0,1
17386,6023186,6023186,1,67.0,1
17387,6023568,6023568,1,68.0,2


In [252]:
full_pheno_3393 = f3393.append(ctrl_3393)
full_pheno_3393

Unnamed: 0,FID,IID,f3393,age,sex
0,1000112,1000112,1,68.0,1
1,1000551,1000551,1,83.0,1
2,1001067,1001067,1,50.0,1
3,1001384,1001384,1,61.0,2
4,1001459,1001459,1,64.0,1
...,...,...,...,...,...
234167,6025363,6025363,0,64.0,1
234168,6025409,6025409,0,66.0,2
234169,6025411,6025411,0,49.0,2
234170,6025425,6025425,0,44.0,2


In [253]:
merged_3393 = pd.merge(left=full_pheno_3393, right=array, left_on=['IID','FID'], right_on=['IID','FID'])
merged_3393

Unnamed: 0,FID,IID,f3393,age,sex,array
0,1000112,1000112,1,68.0,1,1
1,1000551,1000551,1,83.0,1,1
2,1001067,1001067,1,50.0,1,1
3,1001384,1001384,1,61.0,2,1
4,1001459,1001459,1,64.0,1,1
...,...,...,...,...,...,...
251556,6025363,6025363,0,64.0,1,1
251557,6025409,6025409,0,66.0,2,1
251558,6025411,6025411,0,49.0,2,2
251559,6025425,6025425,0,44.0,2,1


In [304]:
merged_3393['ethnicity']='white_european'
merged_3393
merged_3393[["FID", "IID" "ethnicity"]].to_csv("072423_UKBB_Haid_f3393_expandedwhite_DMCS_251561ind.phenopca", sep='\t', index=False)

In [254]:
merged_3393[["FID", "IID","f3393", "age", "sex", "array"]].to_csv("072423_UKBB_Haid_f3393_expandedwhite_DMCS_251561ind.tsv", sep='\t', index=False)

### Final file f2247

In [260]:
ctrl_2247 = ctrl.rename(columns={"hearing_imp_pure_ctrl":"f2247", "ctrl_age":"age"})
ctrl_2247

Unnamed: 0,FID,IID,f2247,age,sex
0,1000063,1000063,0,43.0,1
1,1000078,1000078,0,60.0,2
2,1000081,1000081,0,67.0,1
3,1000129,1000129,0,62.0,1
4,1000137,1000137,0,46.0,2
...,...,...,...,...,...
234167,6025363,6025363,0,64.0,1
234168,6025409,6025409,0,66.0,2
234169,6025411,6025411,0,49.0,2
234170,6025425,6025425,0,44.0,2


In [258]:
f2247= f2247.rename(columns={"f2247_age":"age"})
f2247

Unnamed: 0,FID,IID,f2247,age,sex
0,1000022,1000022,1,53.0,1
1,1000090,1000090,1,64.0,2
2,1000112,1000112,1,68.0,1
3,1000170,1000170,1,51.0,2
4,1000198,1000198,1,41.0,2
...,...,...,...,...,...
112490,6024983,6024983,1,67.0,1
112491,6024997,6024997,1,58.0,2
112492,6025128,6025128,1,64.0,2
112493,6025134,6025134,1,64.0,1


In [261]:
full_pheno_2247 = f2247.append(ctrl_2247)
full_pheno_2247

Unnamed: 0,FID,IID,f2247,age,sex
0,1000022,1000022,1,53.0,1
1,1000090,1000090,1,64.0,2
2,1000112,1000112,1,68.0,1
3,1000170,1000170,1,51.0,2
4,1000198,1000198,1,41.0,2
...,...,...,...,...,...
234167,6025363,6025363,0,64.0,1
234168,6025409,6025409,0,66.0,2
234169,6025411,6025411,0,49.0,2
234170,6025425,6025425,0,44.0,2


In [262]:
merged_2247 = pd.merge(left=full_pheno_2247, right=array, left_on=['IID','FID'], right_on=['IID','FID'])
merged_2247

Unnamed: 0,FID,IID,f2247,age,sex,array
0,1000022,1000022,1,53.0,1,1
1,1000090,1000090,1,64.0,2,1
2,1000112,1000112,1,68.0,1,1
3,1000170,1000170,1,51.0,2,2
4,1000198,1000198,1,41.0,2,1
...,...,...,...,...,...,...
346662,6025363,6025363,0,64.0,1,1
346663,6025409,6025409,0,66.0,2,1
346664,6025411,6025411,0,49.0,2,2
346665,6025425,6025425,0,44.0,2,1


In [305]:
merged_2247['ethnicity']='white_european'
merged_2247
merged_2247[["FID", "IID","ethnicity"]].to_csv("072423_UKBB_Hdiff_2247_expandedwhite_DMCS_346667ind.phenopca", sep='\t', index=False)

In [264]:
merged_2247[["FID", "IID","f2247", "age", "sex", "array"]].to_csv("072423_UKBB_Hdiff_2247_expandedwhite_DMCS_346667ind.tsv", sep='\t', index=False)

### Final file f2247_2257

In [267]:
ctrl_2247_2257 = ctrl.rename(columns={"hearing_imp_pure_ctrl":"f2247_f2257", "ctrl_age":"age"})
ctrl_2247_2257

Unnamed: 0,FID,IID,f2247_f2257,age,sex
0,1000063,1000063,0,43.0,1
1,1000078,1000078,0,60.0,2
2,1000081,1000081,0,67.0,1
3,1000129,1000129,0,62.0,1
4,1000137,1000137,0,46.0,2
...,...,...,...,...,...
234167,6025363,6025363,0,64.0,1
234168,6025409,6025409,0,66.0,2
234169,6025411,6025411,0,49.0,2
234170,6025425,6025425,0,44.0,2


In [266]:
f2247_f2257 = f2247_f2257.rename(columns={"f2247_f2257_age":"age", "both":"f2247_f2257"})
f2247_f2257

Unnamed: 0,FID,IID,f2247_f2257,age,sex
0,1000022,1000022,1,53.0,1
1,1000112,1000112,1,68.0,1
2,1000170,1000170,1,51.0,2
3,1000198,1000198,1,41.0,2
4,1000203,1000203,1,49.0,2
...,...,...,...,...,...
95375,6024983,6024983,1,67.0,1
95376,6024997,6024997,1,58.0,2
95377,6025128,6025128,1,64.0,2
95378,6025134,6025134,1,64.0,1


In [268]:
full_pheno_2247_2257 = f2247_f2257.append(ctrl_2247_2257)
full_pheno_2247_2257

Unnamed: 0,FID,IID,f2247_f2257,age,sex
0,1000022,1000022,1,53.0,1
1,1000112,1000112,1,68.0,1
2,1000170,1000170,1,51.0,2
3,1000198,1000198,1,41.0,2
4,1000203,1000203,1,49.0,2
...,...,...,...,...,...
234167,6025363,6025363,0,64.0,1
234168,6025409,6025409,0,66.0,2
234169,6025411,6025411,0,49.0,2
234170,6025425,6025425,0,44.0,2


In [269]:
merged_2247_2257 = pd.merge(left=full_pheno_2247_2257, right=array, left_on=['IID','FID'], right_on=['IID','FID'])
merged_2247_2257

Unnamed: 0,FID,IID,f2247_f2257,age,sex,array
0,1000022,1000022,1,53.0,1,1
1,1000112,1000112,1,68.0,1,1
2,1000170,1000170,1,51.0,2,2
3,1000198,1000198,1,41.0,2,1
4,1000203,1000203,1,49.0,2,1
...,...,...,...,...,...,...
329547,6025363,6025363,0,64.0,1,1
329548,6025409,6025409,0,66.0,2,1
329549,6025411,6025411,0,49.0,2,2
329550,6025425,6025425,0,44.0,2,1


In [306]:
merged_2247_2257['ethnicity']='white_european'
merged_2247_2257
merged_2247_2257[["FID", "IID","ethnicity"]].to_csv("072423_UKBB_Hboth_2247_2257_expandedwhite_DMCS_329552ind.phenopca", sep='\t', index=False)

In [270]:
merged_2247_2257[["FID", "IID","f2247_f2257", "age", "sex", "array"]].to_csv("072423_UKBB_Hboth_2247_2257_expandedwhite_DMCS_329552ind.tsv", sep='\t', index=False)

## Seperate Males and Females

### Hnoise

In [271]:
merged_2257 = pd.read_csv("072423_UKBB_Hnoise_f2257_expandedwhite_DMCS_397027ind.tsv", sep='\t')
merged_2257

Unnamed: 0,FID,IID,f2257,age,sex,array
0,1000022,1000022,1,53.0,1,1
1,1000035,1000035,1,63.0,1,1
2,1000046,1000046,1,73.0,2,1
3,1000054,1000054,1,65.0,2,2
4,1000105,1000105,1,54.0,2,1
...,...,...,...,...,...,...
397022,6025363,6025363,0,64.0,1,1
397023,6025409,6025409,0,66.0,2,1
397024,6025411,6025411,0,49.0,2,2
397025,6025425,6025425,0,44.0,2,1


In [279]:
males = merged_2257["sex"] == 1

In [280]:
Hnoise_males = merged_2257[males]
Hnoise_males

Unnamed: 0,FID,IID,f2257,age,sex,array
0,1000022,1000022,1,53.0,1,1
1,1000035,1000035,1,63.0,1,1
5,1000112,1000112,1,68.0,1,1
11,1000318,1000318,1,52.0,1,2
12,1000396,1000396,1,48.0,1,1
...,...,...,...,...,...,...
397017,6025251,6025251,0,56.0,1,1
397018,6025268,6025268,0,65.0,1,2
397021,6025335,6025335,0,62.0,1,1
397022,6025363,6025363,0,64.0,1,1


In [281]:
Hnoise_males[["FID", "IID","f2257", "age", "array"]].to_csv("072423_UKBB_Hnoise_f2257_expandedwhite_DMCS_males_180590ind.tsv", sep='\t', index=False)

In [282]:
females = merged_2257["sex"] ==2
Hnoise_females = merged_2257[females]
Hnoise_females

Unnamed: 0,FID,IID,f2257,age,sex,array
2,1000046,1000046,1,73.0,2,1
3,1000054,1000054,1,65.0,2,2
4,1000105,1000105,1,54.0,2,1
6,1000164,1000164,1,51.0,2,1
7,1000170,1000170,1,51.0,2,2
...,...,...,...,...,...,...
397019,6025307,6025307,0,55.0,2,2
397020,6025319,6025319,0,56.0,2,1
397023,6025409,6025409,0,66.0,2,1
397024,6025411,6025411,0,49.0,2,2


In [283]:
Hnoise_females[["FID", "IID","f2257", "age", "array"]].to_csv("072423_UKBB_Hnoise_f2257_expandedwhite_DMCS_females_216437ind.tsv", sep='\t', index=False)

### Haid

In [284]:
merged_3393 = pd.read_csv("072423_UKBB_Haid_f3393_expandedwhite_DMCS_251561ind.tsv", sep='\t')
merged_3393

Unnamed: 0,FID,IID,f3393,age,sex,array
0,1000112,1000112,1,68.0,1,1
1,1000551,1000551,1,83.0,1,1
2,1001067,1001067,1,50.0,1,1
3,1001384,1001384,1,61.0,2,1
4,1001459,1001459,1,64.0,1,1
...,...,...,...,...,...,...
251556,6025363,6025363,0,64.0,1,1
251557,6025409,6025409,0,66.0,2,1
251558,6025411,6025411,0,49.0,2,2
251559,6025425,6025425,0,44.0,2,1


In [285]:
males = merged_3393["sex"] == 1
Haid_males = merged_3393[males]
Haid_males

Unnamed: 0,FID,IID,f3393,age,sex,array
0,1000112,1000112,1,68.0,1,1
1,1000551,1000551,1,83.0,1,1
2,1001067,1001067,1,50.0,1,1
4,1001459,1001459,1,64.0,1,1
6,1002216,1002216,1,62.0,1,2
...,...,...,...,...,...,...
251551,6025251,6025251,0,56.0,1,1
251552,6025268,6025268,0,65.0,1,2
251555,6025335,6025335,0,62.0,1,1
251556,6025363,6025363,0,64.0,1,1


In [301]:
Haid_males[["FID", "IID","f3393", "age", "array"]].to_csv("072423_UKBB_Haid_f3393_expandedwhite_DMCS_males_103695ind.tsv", sep='\t', index=False)

In [286]:
females = merged_3393["sex"] ==2
Haid_females = merged_3393[females]
Haid_females

Unnamed: 0,FID,IID,f3393,age,sex,array
3,1001384,1001384,1,61.0,2,1
5,1001941,1001941,1,69.0,2,1
8,1002806,1002806,1,72.0,2,1
12,1003570,1003570,1,73.0,2,1
13,1004012,1004012,1,57.0,2,1
...,...,...,...,...,...,...
251553,6025307,6025307,0,55.0,2,2
251554,6025319,6025319,0,56.0,2,1
251557,6025409,6025409,0,66.0,2,1
251558,6025411,6025411,0,49.0,2,2


In [287]:
Haid_females[["FID", "IID","f3393", "age", "array"]].to_csv("072423_UKBB_Haid_f3393_expandedwhite_DMCS_females_147866ind.tsv", sep='\t', index=False)

### Hdiff

In [289]:
merged_2247 = pd.read_csv("072423_UKBB_Hdiff_2247_expandedwhite_DMCS_346667ind.tsv", sep='\t')
merged_2247

Unnamed: 0,FID,IID,f2247,age,sex,array
0,1000022,1000022,1,53.0,1,1
1,1000090,1000090,1,64.0,2,1
2,1000112,1000112,1,68.0,1,1
3,1000170,1000170,1,51.0,2,2
4,1000198,1000198,1,41.0,2,1
...,...,...,...,...,...,...
346662,6025363,6025363,0,64.0,1,1
346663,6025409,6025409,0,66.0,2,1
346664,6025411,6025411,0,49.0,2,2
346665,6025425,6025425,0,44.0,2,1


In [290]:
males = merged_2247["sex"] == 1
Hdiff_males = merged_2247[males]
Hdiff_males

Unnamed: 0,FID,IID,f2247,age,sex,array
0,1000022,1000022,1,53.0,1,1
2,1000112,1000112,1,68.0,1,1
6,1000396,1000396,1,48.0,1,1
7,1000494,1000494,1,61.0,1,1
8,1000549,1000549,1,63.0,1,1
...,...,...,...,...,...,...
346657,6025251,6025251,0,56.0,1,1
346658,6025268,6025268,0,65.0,1,2
346661,6025335,6025335,0,62.0,1,1
346662,6025363,6025363,0,64.0,1,1


In [291]:
Hdiff_males[["FID", "IID","f2247", "age", "array"]].to_csv("072423_UKBB_Hdiff_2247_expandedwhite_DMCS_males_156117ind.tsv", sep='\t', index=False)

In [292]:
females = merged_2247["sex"] ==2
Hdiff_females = merged_2247[females]
Hdiff_females

Unnamed: 0,FID,IID,f2247,age,sex,array
1,1000090,1000090,1,64.0,2,1
3,1000170,1000170,1,51.0,2,2
4,1000198,1000198,1,41.0,2,1
5,1000203,1000203,1,49.0,2,1
14,1001052,1001052,1,64.0,2,1
...,...,...,...,...,...,...
346659,6025307,6025307,0,55.0,2,2
346660,6025319,6025319,0,56.0,2,1
346663,6025409,6025409,0,66.0,2,1
346664,6025411,6025411,0,49.0,2,2


In [294]:
Hdiff_females[["FID", "IID","f2247", "age", "array"]].to_csv("072423_UKBB_Hdiff_2247_expandedwhite_DMCS_females_190550ind.tsv", sep='\t', index=False)

### Hboth

In [295]:
merged_2247_2257 = pd.read_csv("072423_UKBB_Hboth_2247_2257_expandedwhite_DMCS_329552ind.tsv", sep='\t')
merged_2247_2257

Unnamed: 0,FID,IID,f2247_f2257,age,sex,array
0,1000022,1000022,1,53.0,1,1
1,1000112,1000112,1,68.0,1,1
2,1000170,1000170,1,51.0,2,2
3,1000198,1000198,1,41.0,2,1
4,1000203,1000203,1,49.0,2,1
...,...,...,...,...,...,...
329547,6025363,6025363,0,64.0,1,1
329548,6025409,6025409,0,66.0,2,1
329549,6025411,6025411,0,49.0,2,2
329550,6025425,6025425,0,44.0,2,1


In [296]:
males = merged_2247_2257["sex"] == 1
Hboth_males = merged_2247_2257[males]
Hboth_males

Unnamed: 0,FID,IID,f2247_f2257,age,sex,array
0,1000022,1000022,1,53.0,1,1
1,1000112,1000112,1,68.0,1,1
5,1000396,1000396,1,48.0,1,1
6,1000494,1000494,1,61.0,1,1
7,1000549,1000549,1,63.0,1,1
...,...,...,...,...,...,...
329542,6025251,6025251,0,56.0,1,1
329543,6025268,6025268,0,65.0,1,2
329546,6025335,6025335,0,62.0,1,1
329547,6025363,6025363,0,64.0,1,1


In [297]:
Hboth_males[["FID", "IID","f2247_f2257", "age", "array"]].to_csv("072423_UKBB_Hboth_2247_2257_expandedwhite_DMCS_males_147404ind.tsv", sep='\t', index=False)

In [299]:
females = merged_2247_2257["sex"] ==2
Hboth_females = merged_2247_2257[females]
Hboth_females

Unnamed: 0,FID,IID,f2247_f2257,age,sex,array
2,1000170,1000170,1,51.0,2,2
3,1000198,1000198,1,41.0,2,1
4,1000203,1000203,1,49.0,2,1
14,1001099,1001099,1,62.0,2,1
15,1001123,1001123,1,62.0,2,1
...,...,...,...,...,...,...
329544,6025307,6025307,0,55.0,2,2
329545,6025319,6025319,0,56.0,2,1
329548,6025409,6025409,0,66.0,2,1
329549,6025411,6025411,0,49.0,2,2


In [300]:
Hboth_females[["FID", "IID","f2247_f2257", "age", "array"]].to_csv("072423_UKBB_Hboth_2247_2257_expandedwhite_DMCS_females_182148ind.tsv", sep='\t', index=False)

# After this section belongs to Elnaz other codes

## Making pheno files for unrelated white European individuals 

In [8]:
Hdiff = pd.read_csv('/mnt/vast/hpc/csg/en2509/Phenotype/ARHL_Phenos_02152023_EN/UKBB_Hdiff_F2247_4PCs_All.pheno', delimiter=' ')
Hdiff

Unnamed: 0,FID,IID,f2247,age,sex,array,PC1,PC2,PC3,PC4
0,1000022,1000022,1,53,1,1,-0.001246,-0.002730,0.017134,-0.005394
1,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
2,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
3,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
4,1000090,1000090,1,64,2,1,-0.017105,0.017367,-0.040881,0.004871
...,...,...,...,...,...,...,...,...,...,...
346094,6024909,6024909,1,49,2,1,-0.012714,0.006905,-0.003829,-0.011212
346095,6025004,6025004,0,70,2,1,0.000891,0.002434,-0.011637,0.005646
346096,6025167,6025167,0,52,2,1,-0.001835,0.013914,-0.001599,-0.004673
346097,6025425,6025425,0,44,2,1,0.019182,-0.014949,-0.008309,0.013414


In [3]:
Unrelated = pd.read_csv('/mnt/vast/hpc/csg/en2509/Rsq_imputation/European/Unrelated_EU_afterPCA_updated_353088ind.txt', delimiter='\t')
Unrelated

Unnamed: 0,FID,IID
0,1000022,1000022
1,1000046,1000046
2,1000063,1000063
3,1000078,1000078
4,1000081,1000081
...,...,...
353083,6020607,6020607
353084,6022039,6022039
353085,6023046,6023046
353086,6023549,6023549


In [9]:
Hnoise = pd.read_csv('/mnt/vast/hpc/csg/en2509/Phenotype/ARHL_Phenos_02152023_EN/UKBB_Hnoise_F2257_4PCs_All.pheno', delimiter=' ')
Hnoise

Unnamed: 0,FID,IID,f2257,age,sex,array,PC1,PC2,PC3,PC4
0,1000022,1000022,1,53,1,1,-0.001246,-0.002730,0.017134,-0.005394
1,1000046,1000046,1,73,2,1,-0.009498,0.004089,-0.012498,0.008941
2,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
3,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
4,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
...,...,...,...,...,...,...,...,...,...,...
397145,6025004,6025004,0,70,2,1,0.000891,0.002434,-0.011637,0.005646
397146,6025167,6025167,0,52,2,1,-0.001835,0.013914,-0.001599,-0.004673
397147,6025322,6025322,1,63,2,1,-0.003041,0.028747,-0.009047,0.003363
397148,6025425,6025425,0,44,2,1,0.019182,-0.014949,-0.008309,0.013414


In [10]:
Hboth = pd.read_csv('/mnt/vast/hpc/csg/en2509/Phenotype/ARHL_Phenos_02152023_EN/UKBB_Hboth_F2247_F2257_4PCs_All.pheno', delimiter=' ')
Hboth

Unnamed: 0,FID,IID,f2247_f2257,age,sex,array,PC1,PC2,PC3,PC4
0,1000022,1000022,1,53,1,1,-0.001246,-0.002730,0.017134,-0.005394
1,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
2,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
3,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
4,1000112,1000112,1,68,1,1,-0.006373,0.009384,-0.030781,0.006168
...,...,...,...,...,...,...,...,...,...,...
329117,6024909,6024909,1,49,2,1,-0.012714,0.006905,-0.003829,-0.011212
329118,6025004,6025004,0,70,2,1,0.000891,0.002434,-0.011637,0.005646
329119,6025167,6025167,0,52,2,1,-0.001835,0.013914,-0.001599,-0.004673
329120,6025425,6025425,0,44,2,1,0.019182,-0.014949,-0.008309,0.013414


In [11]:
Haid = pd.read_csv('/mnt/vast/hpc/csg/en2509/Phenotype/ARHL_Phenos_02152023_EN/UKBB_Haid_F3393_4PCs_All.pheno', delimiter=' ')
Haid

Unnamed: 0,FID,IID,f3393,age,sex,array,PC1,PC2,PC3,PC4
0,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
1,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
2,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
3,1000112,1000112,1,68,1,1,-0.006373,0.009384,-0.030781,0.006168
4,1000129,1000129,0,62,1,2,-0.005777,-0.005378,0.010749,-0.003741
...,...,...,...,...,...,...,...,...,...,...
252480,6024692,6024692,0,46,1,1,0.004719,-0.008683,-0.000753,0.004104
252481,6025004,6025004,0,70,2,1,0.000891,0.002434,-0.011637,0.005646
252482,6025167,6025167,0,52,2,1,-0.001835,0.013914,-0.001599,-0.004673
252483,6025425,6025425,0,44,2,1,0.019182,-0.014949,-0.008309,0.013414


In [13]:
Unrelated_Hdiff = Hdiff[Hdiff.loc[:,'IID'].isin(Unrelated['IID'])]
Unrelated_Hdiff

Unnamed: 0,FID,IID,f2247,age,sex,array,PC1,PC2,PC3,PC4
0,1000022,1000022,1,53,1,1,-0.001246,-0.002730,0.017134,-0.005394
1,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
2,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
3,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
4,1000090,1000090,1,64,2,1,-0.017105,0.017367,-0.040881,0.004871
...,...,...,...,...,...,...,...,...,...,...
346025,6020607,6020607,1,62,1,1,-0.001649,0.000177,0.016048,-0.002361
346048,6022039,6022039,1,57,2,1,0.011899,-0.009931,-0.003741,-0.006151
346063,6023046,6023046,0,40,2,1,0.009782,0.001445,0.016083,-0.020262
346070,6023549,6023549,0,65,1,1,0.011020,-0.012357,0.020477,0.003696


In [14]:
Unrelated_Hnoise = Hnoise[Hnoise.loc[:,'IID'].isin(Unrelated['IID'])]
Unrelated_Hnoise

Unnamed: 0,FID,IID,f2257,age,sex,array,PC1,PC2,PC3,PC4
0,1000022,1000022,1,53,1,1,-0.001246,-0.002730,0.017134,-0.005394
1,1000046,1000046,1,73,2,1,-0.009498,0.004089,-0.012498,0.008941
2,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
3,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
4,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
...,...,...,...,...,...,...,...,...,...,...
397064,6020607,6020607,1,62,1,1,-0.001649,0.000177,0.016048,-0.002361
397093,6022039,6022039,1,57,2,1,0.011899,-0.009931,-0.003741,-0.006151
397107,6023046,6023046,0,40,2,1,0.009782,0.001445,0.016083,-0.020262
397117,6023549,6023549,0,65,1,1,0.011020,-0.012357,0.020477,0.003696


In [15]:
Unrelated_Hboth = Hboth[Hboth.loc[:,'IID'].isin(Unrelated['IID'])]
Unrelated_Hboth

Unnamed: 0,FID,IID,f2247_f2257,age,sex,array,PC1,PC2,PC3,PC4
0,1000022,1000022,1,53,1,1,-0.001246,-0.002730,0.017134,-0.005394
1,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
2,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
3,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
4,1000112,1000112,1,68,1,1,-0.006373,0.009384,-0.030781,0.006168
...,...,...,...,...,...,...,...,...,...,...
329053,6020607,6020607,1,62,1,1,-0.001649,0.000177,0.016048,-0.002361
329076,6022039,6022039,1,57,2,1,0.011899,-0.009931,-0.003741,-0.006151
329089,6023046,6023046,0,40,2,1,0.009782,0.001445,0.016083,-0.020262
329095,6023549,6023549,0,65,1,1,0.011020,-0.012357,0.020477,0.003696


In [16]:
Unrelated_Haid = Haid[Haid.loc[:,'IID'].isin(Unrelated['IID'])]
Unrelated_Haid

Unnamed: 0,FID,IID,f3393,age,sex,array,PC1,PC2,PC3,PC4
0,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
1,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
2,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
3,1000112,1000112,1,68,1,1,-0.006373,0.009384,-0.030781,0.006168
4,1000129,1000129,0,62,1,2,-0.005777,-0.005378,0.010749,-0.003741
...,...,...,...,...,...,...,...,...,...,...
252258,6003694,6003694,1,67,1,1,0.002073,-0.000447,-0.004327,-0.007727
252416,6018137,6018137,0,57,2,1,0.014297,-0.003259,0.001209,0.007020
252455,6022039,6022039,1,57,2,1,0.011899,-0.009931,-0.003741,-0.006151
252464,6023046,6023046,0,40,2,1,0.009782,0.001445,0.016083,-0.020262


In [7]:
Tinnitus = pd.read_csv('/mnt/mfs/hgrcgrid/homes/fr2540/projects/tinnitus/PCA/tinnitus_case_group3_0222_5081cases_99351controls_related_unrelated.pheno', sep='\t')
Tinnitus

Unnamed: 0,FID,IID,sex,age,tinnitus_3,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,array
0,1000046,1000046,1,73,0,-0.007985,0.004271,-0.013161,0.005613,-0.009311,-0.009297,0.000614,-0.001315,-0.001588,0.000533,1
1,1000063,1000063,0,43,0,-0.003617,0.002522,0.013547,0.002792,-0.009610,-0.010373,0.011888,0.012609,-0.010592,0.003954,1
2,1000078,1000078,1,60,0,-0.008100,0.003432,0.010375,-0.006071,-0.007506,-0.010840,-0.001616,0.009411,-0.004137,0.012528,1
3,1000081,1000081,0,67,0,0.118704,-0.048038,-0.002625,0.012228,0.006972,-0.002569,0.007648,-0.004643,-0.000582,0.011976,1
4,1000141,1000141,1,49,0,-0.001355,0.001002,0.003562,0.003884,-0.014152,-0.010774,0.010278,0.016399,0.002039,0.001612,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104427,6024002,6024002,0,52,0,0.015383,-0.022075,-0.008029,-0.005885,0.005767,0.000471,0.003523,-0.000987,0.005845,0.002428,1
104428,6024614,6024614,1,63,0,-0.000132,0.011875,-0.004452,0.005759,-0.013814,0.007971,-0.003415,-0.002461,0.007312,-0.013765,1
104429,6024620,6024620,0,55,1,-0.002454,0.004290,0.014269,0.008715,0.002799,0.001548,-0.001065,0.002623,0.007900,0.005889,2
104430,6024870,6024870,1,53,0,-0.011629,-0.000451,-0.009712,0.000519,-0.007135,-0.006879,0.009101,0.013579,0.007336,0.009927,1


In [8]:
Unrelated_Tinnitus = Tinnitus[Tinnitus.loc[:,'IID'].isin(Unrelated['IID'])]
Unrelated_Tinnitus

Unnamed: 0,FID,IID,sex,age,tinnitus_3,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,array
0,1000046,1000046,1,73,0,-0.007985,0.004271,-0.013161,0.005613,-0.009311,-0.009297,0.000614,-0.001315,-0.001588,0.000533,1
1,1000063,1000063,0,43,0,-0.003617,0.002522,0.013547,0.002792,-0.009610,-0.010373,0.011888,0.012609,-0.010592,0.003954,1
2,1000078,1000078,1,60,0,-0.008100,0.003432,0.010375,-0.006071,-0.007506,-0.010840,-0.001616,0.009411,-0.004137,0.012528,1
3,1000081,1000081,0,67,0,0.118704,-0.048038,-0.002625,0.012228,0.006972,-0.002569,0.007648,-0.004643,-0.000582,0.011976,1
4,1000141,1000141,1,49,0,-0.001355,0.001002,0.003562,0.003884,-0.014152,-0.010774,0.010278,0.016399,0.002039,0.001612,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104029,5933453,5933453,1,60,0,0.003846,-0.004126,-0.000154,-0.006543,0.001768,-0.001251,-0.000040,-0.005595,-0.009682,-0.002248,1
104162,5963216,5963216,1,63,0,0.025315,-0.029438,-0.004490,0.005427,0.000668,0.010613,-0.005363,0.011105,0.003907,0.003028,1
104226,5976649,5976649,1,48,0,-0.005287,0.008702,-0.000494,0.007539,-0.001097,0.011666,0.012358,0.004873,0.003849,0.009754,1
104239,5979222,5979222,1,42,0,0.007609,-0.010344,-0.016789,0.003753,0.006667,0.012006,0.006291,-0.000754,0.003086,0.004259,1


In [2]:
Tinnitus_All = pd.read_csv('/mnt/mfs/hgrcgrid/homes/fr2540/projects/tinnitus/PCA/tinnitus_case_group1_all_53202cases_128931controls_related_unrelated.pheno', sep='\t')
Tinnitus_All

Unnamed: 0,FID,IID,sex,age,tinnitus_1,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,array
0,1000046,1000046,1,73,0,-0.007985,0.004271,-0.013161,0.005613,-0.009311,-0.009297,0.000614,-0.001315,-0.001588,0.000533,1
1,1000063,1000063,0,43,0,-0.003617,0.002522,0.013547,0.002792,-0.009610,-0.010373,0.011888,0.012609,-0.010592,0.003954,1
2,1000078,1000078,1,60,0,-0.008100,0.003432,0.010375,-0.006071,-0.007506,-0.010840,-0.001616,0.009411,-0.004137,0.012528,1
3,1000081,1000081,0,67,0,0.118704,-0.048038,-0.002625,0.012228,0.006972,-0.002569,0.007648,-0.004643,-0.000582,0.011976,1
4,1000090,1000090,1,64,0,-0.015511,0.019004,-0.041280,-0.000970,-0.015544,-0.010922,0.012069,0.012576,0.001526,-0.003160,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182128,6024614,6024614,1,63,0,-0.000132,0.011875,-0.004452,0.005759,-0.013814,0.007971,-0.003415,-0.002461,0.007312,-0.013765,1
182129,6024620,6024620,0,55,1,-0.002454,0.004290,0.014269,0.008715,0.002799,0.001548,-0.001065,0.002623,0.007900,0.005889,2
182130,6024671,6024671,0,64,1,-0.001764,-0.001129,-0.005330,0.000998,-0.011287,0.004729,-0.012297,0.000264,-0.005895,-0.009035,1
182131,6024870,6024870,1,53,0,-0.011629,-0.000451,-0.009712,0.000519,-0.007135,-0.006879,0.009101,0.013579,0.007336,0.009927,1


In [4]:
Unrelated_Tinnitus_All = Tinnitus_All[Tinnitus_All.loc[:,'IID'].isin(Unrelated['IID'])]
Unrelated_Tinnitus_All

Unnamed: 0,FID,IID,sex,age,tinnitus_1,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,array
0,1000046,1000046,1,73,0,-0.007985,0.004271,-0.013161,0.005613,-0.009311,-0.009297,0.000614,-0.001315,-0.001588,0.000533,1
1,1000063,1000063,0,43,0,-0.003617,0.002522,0.013547,0.002792,-0.009610,-0.010373,0.011888,0.012609,-0.010592,0.003954,1
2,1000078,1000078,1,60,0,-0.008100,0.003432,0.010375,-0.006071,-0.007506,-0.010840,-0.001616,0.009411,-0.004137,0.012528,1
3,1000081,1000081,0,67,0,0.118704,-0.048038,-0.002625,0.012228,0.006972,-0.002569,0.007648,-0.004643,-0.000582,0.011976,1
4,1000090,1000090,1,64,0,-0.015511,0.019004,-0.041280,-0.000970,-0.015544,-0.010922,0.012069,0.012576,0.001526,-0.003160,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181956,6003694,6003694,0,67,1,-0.000311,0.001018,0.000409,-0.000617,0.016143,-0.012850,0.004391,-0.000704,0.003083,0.005408,1
182017,6010960,6010960,1,54,1,0.008671,-0.019938,0.022205,-0.039419,0.001856,0.025617,-0.005797,0.010526,0.008437,0.008018,1
182078,6018780,6018780,1,53,1,0.002949,0.002163,-0.008237,0.003069,-0.006749,0.010805,0.009847,-0.007844,-0.010709,0.009259,1
182090,6020607,6020607,0,62,0,-0.001818,0.000777,0.011854,0.003908,0.006565,-0.000488,0.004228,0.001599,0.011280,0.002901,1


In [9]:
Unrelated_Tinnitus.to_csv("/mnt/vast/hpc/csg/en2509/autosomal/output/Unrelated_Inds/Unrelated_White_EU_Unrelated_Tinnitus_case_group3_Originally_5081cases_99351controls.pheno", sep = '\t', index=False)

In [5]:
Unrelated_Tinnitus_All.to_csv("/mnt/vast/hpc/csg/en2509/autosomal/output/Unrelated_Inds/Unrelated_White_EU_Tinnitus_All_case_group1_Originally_53202cases_128931controls.pheno", sep = '\t', index=False)

In [17]:
Unrelated_Haid.to_csv("/mnt/vast/hpc/csg/en2509/autosomal/output/Unrelated_Inds/Unrelated_White_EU_Haid_F3393_4PCs_All.pheno", sep = '\t', index=False)

In [18]:
Unrelated_Hboth.to_csv("/mnt/vast/hpc/csg/en2509/autosomal/output/Unrelated_Inds/Unrelated_White_EU_Hboth_F2247_F2257_4PCs_All.pheno", sep = '\t', index=False)

In [19]:
Unrelated_Hdiff.to_csv("/mnt/vast/hpc/csg/en2509/autosomal/output/Unrelated_Inds/Unrelated_White_EU_Hdiff_F2247_4PCs_All.pheno", sep = '\t', index=False)

In [20]:
Unrelated_Hnoise.to_csv("/mnt/vast/hpc/csg/en2509/autosomal/output/Unrelated_Inds/Unrelated_White_EU_Hnoise_F2257_4PCs_All.pheno", sep = '\t', index=False)

## Select Discovery and replication sets based on white British and other white individulas among white Europeans

In [2]:
### keep white European 
white = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/ukb47922_white_460649ind.pheno", sep = '\t', engine = 'python')

In [3]:
white

Unnamed: 0,FID,IID,ethnicity
0,1000019,1000019,British
1,1000022,1000022,British
2,1000035,1000035,British
3,1000046,1000046,British
4,1000054,1000054,British
...,...,...,...
460644,6025390,6025390,British
460645,6025409,6025409,British
460646,6025411,6025411,British
460647,6025425,6025425,British


In [4]:
# outlier individuals that will need to be removed
outlier = pd.read_csv("/mnt/mfs/statgen/UKBiobank/results/092821_PCA_results_500K/092821_PCA_related_pval0.005/ukb47922_white_460649ind.092821_PCA_related_pval0.005.pca.projected.outliers", sep="\t", header=None)
outlier

Unnamed: 0,0,1
0,1003423,1003423
1,1008606,1008606
2,1009852,1009852
3,1010412,1010412
4,1010678,1010678
...,...,...
1377,5801962,5801962
1378,5807807,5807807
1379,5809112,5809112
1380,5833189,5833189


In [6]:
outlier.columns =['FID', 'IID']

In [7]:
White_Filtered_PCA = white[~white.loc[:,'IID'].isin(outlier['IID'])]
White_Filtered_PCA

Unnamed: 0,FID,IID,ethnicity
0,1000019,1000019,British
1,1000022,1000022,British
2,1000035,1000035,British
3,1000046,1000046,British
4,1000054,1000054,British
...,...,...,...
460644,6025390,6025390,British
460645,6025409,6025409,British
460646,6025411,6025411,British
460647,6025425,6025425,British


In [8]:
category_counts = White_Filtered_PCA['ethnicity'].value_counts()
print("Number of individuals per category:")
print(category_counts)

Number of individuals per category:
British                       428900
Any_other_white_background     15091
Irish                          12516
Unknown                         1655
Inconsistent_white               584
White                            521
Name: ethnicity, dtype: int64


### 428,900 British and 30,367 other white

In [9]:
subset_British = White_Filtered_PCA[White_Filtered_PCA['ethnicity'] == 'British'][['IID', 'FID']]
subset_British

Unnamed: 0,IID,FID
0,1000019,1000019
1,1000022,1000022
2,1000035,1000035
3,1000046,1000046
4,1000054,1000054
...,...,...
460644,6025390,6025390
460645,6025409,6025409
460646,6025411,6025411
460647,6025425,6025425


In [10]:
subset_Other_White = White_Filtered_PCA[White_Filtered_PCA['ethnicity'] != 'British'][['IID', 'FID']]
subset_Other_White

Unnamed: 0,IID,FID
8,1000090,1000090
29,1000331,1000331
36,1000415,1000415
61,1000701,1000701
68,1000776,1000776
...,...,...
460524,6024002,6024002
460529,6024051,6024051
460534,6024100,6024100
460543,6024208,6024208


In [13]:
#subset_British.to_csv("/mnt/vast/hpc/csg/en2509/autosomal/DiscoveryReplication/IDs_WhiteBritish.csv",index=False)

In [14]:
#subset_Other_White.to_csv("/mnt/vast/hpc/csg/en2509/autosomal/DiscoveryReplication/IDs_OtherWhites.csv",index=False)

In [15]:
Haid = pd.read_csv('/mnt/vast/hpc/csg/en2509/Phenotype/ARHL_Phenos_02152023_EN/UKBB_Haid_F3393_4PCs_All.pheno', delimiter=' ')
Haid

Unnamed: 0,FID,IID,f3393,age,sex,array,PC1,PC2,PC3,PC4
0,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
1,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
2,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
3,1000112,1000112,1,68,1,1,-0.006373,0.009384,-0.030781,0.006168
4,1000129,1000129,0,62,1,2,-0.005777,-0.005378,0.010749,-0.003741
...,...,...,...,...,...,...,...,...,...,...
252480,6024692,6024692,0,46,1,1,0.004719,-0.008683,-0.000753,0.004104
252481,6025004,6025004,0,70,2,1,0.000891,0.002434,-0.011637,0.005646
252482,6025167,6025167,0,52,2,1,-0.001835,0.013914,-0.001599,-0.004673
252483,6025425,6025425,0,44,2,1,0.019182,-0.014949,-0.008309,0.013414


In [16]:
Hdiff = pd.read_csv('/mnt/vast/hpc/csg/en2509/Phenotype/ARHL_Phenos_02152023_EN/UKBB_Hdiff_F2247_4PCs_All.pheno', delimiter=' ')
Hdiff

Unnamed: 0,FID,IID,f2247,age,sex,array,PC1,PC2,PC3,PC4
0,1000022,1000022,1,53,1,1,-0.001246,-0.002730,0.017134,-0.005394
1,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
2,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
3,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
4,1000090,1000090,1,64,2,1,-0.017105,0.017367,-0.040881,0.004871
...,...,...,...,...,...,...,...,...,...,...
346094,6024909,6024909,1,49,2,1,-0.012714,0.006905,-0.003829,-0.011212
346095,6025004,6025004,0,70,2,1,0.000891,0.002434,-0.011637,0.005646
346096,6025167,6025167,0,52,2,1,-0.001835,0.013914,-0.001599,-0.004673
346097,6025425,6025425,0,44,2,1,0.019182,-0.014949,-0.008309,0.013414


In [17]:
Hnoise = pd.read_csv('/mnt/vast/hpc/csg/en2509/Phenotype/ARHL_Phenos_02152023_EN/UKBB_Hnoise_F2257_4PCs_All.pheno', delimiter=' ')
Hnoise

Unnamed: 0,FID,IID,f2257,age,sex,array,PC1,PC2,PC3,PC4
0,1000022,1000022,1,53,1,1,-0.001246,-0.002730,0.017134,-0.005394
1,1000046,1000046,1,73,2,1,-0.009498,0.004089,-0.012498,0.008941
2,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
3,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
4,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
...,...,...,...,...,...,...,...,...,...,...
397145,6025004,6025004,0,70,2,1,0.000891,0.002434,-0.011637,0.005646
397146,6025167,6025167,0,52,2,1,-0.001835,0.013914,-0.001599,-0.004673
397147,6025322,6025322,1,63,2,1,-0.003041,0.028747,-0.009047,0.003363
397148,6025425,6025425,0,44,2,1,0.019182,-0.014949,-0.008309,0.013414


In [18]:
Hboth = pd.read_csv('/mnt/vast/hpc/csg/en2509/Phenotype/ARHL_Phenos_02152023_EN/UKBB_Hboth_F2247_F2257_4PCs_All.pheno', delimiter=' ')
Hboth

Unnamed: 0,FID,IID,f2247_f2257,age,sex,array,PC1,PC2,PC3,PC4
0,1000022,1000022,1,53,1,1,-0.001246,-0.002730,0.017134,-0.005394
1,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
2,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
3,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
4,1000112,1000112,1,68,1,1,-0.006373,0.009384,-0.030781,0.006168
...,...,...,...,...,...,...,...,...,...,...
329117,6024909,6024909,1,49,2,1,-0.012714,0.006905,-0.003829,-0.011212
329118,6025004,6025004,0,70,2,1,0.000891,0.002434,-0.011637,0.005646
329119,6025167,6025167,0,52,2,1,-0.001835,0.013914,-0.001599,-0.004673
329120,6025425,6025425,0,44,2,1,0.019182,-0.014949,-0.008309,0.013414


In [19]:
subset_IIDs = subset_British['IID'].tolist()  # Convert the "IID" column to a list
Discovery_hnoise = Hnoise[Hnoise['IID'].isin(subset_IIDs)]
Discovery_hnoise

Unnamed: 0,FID,IID,f2257,age,sex,array,PC1,PC2,PC3,PC4
0,1000022,1000022,1,53,1,1,-0.001246,-0.002730,0.017134,-0.005394
1,1000046,1000046,1,73,2,1,-0.009498,0.004089,-0.012498,0.008941
2,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
3,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
4,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
...,...,...,...,...,...,...,...,...,...,...
397145,6025004,6025004,0,70,2,1,0.000891,0.002434,-0.011637,0.005646
397146,6025167,6025167,0,52,2,1,-0.001835,0.013914,-0.001599,-0.004673
397147,6025322,6025322,1,63,2,1,-0.003041,0.028747,-0.009047,0.003363
397148,6025425,6025425,0,44,2,1,0.019182,-0.014949,-0.008309,0.013414


In [20]:
category_counts = Discovery_hnoise['f2257'].value_counts()
print("Number of occurrences per category:")
print(category_counts)

Number of occurrences per category:
0    222487
1    148900
Name: f2257, dtype: int64


In [21]:
columns_to_remove = ['PC1', 'PC2', 'PC3', 'PC4']
df_subset = Discovery_hnoise.drop(columns_to_remove, axis=1)
df_subset

Unnamed: 0,FID,IID,f2257,age,sex,array
0,1000022,1000022,1,53,1,1
1,1000046,1000046,1,73,2,1
2,1000063,1000063,0,43,1,1
3,1000078,1000078,0,60,2,1
4,1000081,1000081,0,67,1,1
...,...,...,...,...,...,...
397145,6025004,6025004,0,70,2,1
397146,6025167,6025167,0,52,2,1
397147,6025322,6025322,1,63,2,1
397148,6025425,6025425,0,44,2,1


In [22]:
df_subset.to_csv("/mnt/vast/hpc/csg/en2509/autosomal/DiscoveryReplication/Hnoise/Discovery/Hnoise_British_Discovery.csv",index=False)

In [23]:
subset_IIDs = subset_Other_White['IID'].tolist()  # Convert the "IID" column to a list
Replication_hnoise = Hnoise[Hnoise['IID'].isin(subset_IIDs)]
Replication_hnoise

Unnamed: 0,FID,IID,f2257,age,sex,array,PC1,PC2,PC3,PC4
19,1000331,1000331,0,53,2,1,0.076316,-0.046557,-0.003439,-0.008151
24,1000415,1000415,0,65,1,1,-0.010352,0.017313,-0.047860,-0.002530
39,1000701,1000701,0,58,2,1,-0.002600,-0.003810,0.011087,0.001300
43,1000799,1000799,1,44,1,1,0.018243,-0.016175,0.014078,-0.004426
47,1000858,1000858,0,61,1,1,0.146847,-0.087013,-0.031188,0.015055
...,...,...,...,...,...,...,...,...,...,...
397054,6020127,6020127,1,62,1,1,0.026480,-0.032234,-0.011307,0.012454
397070,6020940,6020940,0,45,1,1,0.003105,-0.008832,0.048612,-0.000320
397076,6021166,6021166,1,60,2,1,0.029255,-0.042018,0.022066,0.001429
397128,6023920,6023920,0,42,2,1,-0.026426,-0.011967,-0.004445,-0.006127


In [24]:
category_counts = Replication_hnoise['f2257'].value_counts()
print("Number of occurrences per category:")
print(category_counts)

Number of occurrences per category:
0    15083
1    10680
Name: f2257, dtype: int64


In [25]:
columns_to_remove = ['PC1', 'PC2', 'PC3', 'PC4']
df_subset = Replication_hnoise.drop(columns_to_remove, axis=1)
df_subset

Unnamed: 0,FID,IID,f2257,age,sex,array
19,1000331,1000331,0,53,2,1
24,1000415,1000415,0,65,1,1
39,1000701,1000701,0,58,2,1
43,1000799,1000799,1,44,1,1
47,1000858,1000858,0,61,1,1
...,...,...,...,...,...,...
397054,6020127,6020127,1,62,1,1
397070,6020940,6020940,0,45,1,1
397076,6021166,6021166,1,60,2,1
397128,6023920,6023920,0,42,2,1


In [26]:
df_subset.to_csv("/mnt/vast/hpc/csg/en2509/autosomal/DiscoveryReplication/Hnoise/Replication/Hnoise_OtherWhite_Replication.csv",index=False)

In [27]:
subset_IIDs = subset_British['IID'].tolist()  # Convert the "IID" column to a list
Discovery_Haid = Haid[Haid['IID'].isin(subset_IIDs)]
Discovery_Haid

Unnamed: 0,FID,IID,f3393,age,sex,array,PC1,PC2,PC3,PC4
0,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
1,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
2,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
3,1000112,1000112,1,68,1,1,-0.006373,0.009384,-0.030781,0.006168
4,1000129,1000129,0,62,1,2,-0.005777,-0.005378,0.010749,-0.003741
...,...,...,...,...,...,...,...,...,...,...
252480,6024692,6024692,0,46,1,1,0.004719,-0.008683,-0.000753,0.004104
252481,6025004,6025004,0,70,2,1,0.000891,0.002434,-0.011637,0.005646
252482,6025167,6025167,0,52,2,1,-0.001835,0.013914,-0.001599,-0.004673
252483,6025425,6025425,0,44,2,1,0.019182,-0.014949,-0.008309,0.013414


In [28]:
category_counts = Discovery_Haid['f3393'].value_counts()
print("Number of occurrences per category:")
print(category_counts)

Number of occurrences per category:
0    222487
1     14167
Name: f3393, dtype: int64


In [29]:
subset_IIDs = subset_British['IID'].tolist()  # Convert the "IID" column to a list
Discovery_Hdiff = Hdiff[Hdiff['IID'].isin(subset_IIDs)]
Discovery_Hdiff

Unnamed: 0,FID,IID,f2247,age,sex,array,PC1,PC2,PC3,PC4
0,1000022,1000022,1,53,1,1,-0.001246,-0.002730,0.017134,-0.005394
1,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
2,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
3,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
5,1000112,1000112,1,68,1,1,-0.006373,0.009384,-0.030781,0.006168
...,...,...,...,...,...,...,...,...,...,...
346094,6024909,6024909,1,49,2,1,-0.012714,0.006905,-0.003829,-0.011212
346095,6025004,6025004,0,70,2,1,0.000891,0.002434,-0.011637,0.005646
346096,6025167,6025167,0,52,2,1,-0.001835,0.013914,-0.001599,-0.004673
346097,6025425,6025425,0,44,2,1,0.019182,-0.014949,-0.008309,0.013414


In [30]:
category_counts = Discovery_Hdiff['f2247'].value_counts()
print("Number of occurrences per category:")
print(category_counts)

Number of occurrences per category:
0    222487
1    101773
Name: f2247, dtype: int64


In [31]:
subset_IIDs = subset_British['IID'].tolist()  # Convert the "IID" column to a list
Discovery_Hboth = Hboth[Hboth['IID'].isin(subset_IIDs)]
Discovery_Hboth

Unnamed: 0,FID,IID,f2247_f2257,age,sex,array,PC1,PC2,PC3,PC4
0,1000022,1000022,1,53,1,1,-0.001246,-0.002730,0.017134,-0.005394
1,1000063,1000063,0,43,1,1,-0.003469,0.002587,0.011446,0.004137
2,1000078,1000078,0,60,2,1,-0.009168,0.000995,0.012004,-0.000638
3,1000081,1000081,0,67,1,1,0.118118,-0.047693,-0.002693,0.010003
4,1000112,1000112,1,68,1,1,-0.006373,0.009384,-0.030781,0.006168
...,...,...,...,...,...,...,...,...,...,...
329117,6024909,6024909,1,49,2,1,-0.012714,0.006905,-0.003829,-0.011212
329118,6025004,6025004,0,70,2,1,0.000891,0.002434,-0.011637,0.005646
329119,6025167,6025167,0,52,2,1,-0.001835,0.013914,-0.001599,-0.004673
329120,6025425,6025425,0,44,2,1,0.019182,-0.014949,-0.008309,0.013414


In [32]:
category_counts = Discovery_Hboth['f2247_f2257'].value_counts()
print("Number of occurrences per category:")
print(category_counts)

Number of occurrences per category:
0    222487
1     85875
Name: f2247_f2257, dtype: int64


In [33]:
subset_IIDs = subset_Other_White['IID'].tolist()  # Convert the "IID" column to a list
Replication_Haid = Haid[Haid['IID'].isin(subset_IIDs)]
Replication_Haid

Unnamed: 0,FID,IID,f3393,age,sex,array,PC1,PC2,PC3,PC4
10,1000331,1000331,0,53,2,1,0.076316,-0.046557,-0.003439,-0.008151
14,1000415,1000415,0,65,1,1,-0.010352,0.017313,-0.047860,-0.002530
25,1000701,1000701,0,58,2,1,-0.002600,-0.003810,0.011087,0.001300
29,1000858,1000858,0,61,1,1,0.146847,-0.087013,-0.031188,0.015055
32,1000914,1000914,0,47,1,1,-0.002959,-0.009982,0.026722,-0.003840
...,...,...,...,...,...,...,...,...,...,...
252404,6017262,6017262,0,55,2,1,0.032185,-0.041464,-0.000240,0.006460
252432,6020089,6020089,0,58,1,2,0.026561,-0.038922,0.011565,-0.013214
252442,6020940,6020940,0,45,1,1,0.003105,-0.008832,0.048612,-0.000320
252473,6023920,6023920,0,42,2,1,-0.026426,-0.011967,-0.004445,-0.006127


In [34]:
category_counts = Replication_Haid['f3393'].value_counts()
print("Number of occurrences per category:")
print(category_counts)

Number of occurrences per category:
0    15083
1      748
Name: f3393, dtype: int64


In [35]:
subset_IIDs = subset_Other_White['IID'].tolist()  # Convert the "IID" column to a list
Replication_Hdiff = Hdiff[Hdiff['IID'].isin(subset_IIDs)]
Replication_Hdiff

Unnamed: 0,FID,IID,f2247,age,sex,array,PC1,PC2,PC3,PC4
4,1000090,1000090,1,64,2,1,-0.017105,0.017367,-0.040881,0.004871
15,1000331,1000331,0,53,2,1,0.076316,-0.046557,-0.003439,-0.008151
20,1000415,1000415,0,65,1,1,-0.010352,0.017313,-0.047860,-0.002530
34,1000701,1000701,0,58,2,1,-0.002600,-0.003810,0.011087,0.001300
40,1000858,1000858,0,61,1,1,0.146847,-0.087013,-0.031188,0.015055
...,...,...,...,...,...,...,...,...,...,...
345998,6018674,6018674,1,62,2,1,-0.008631,0.010347,0.000801,-0.005217
346015,6020089,6020089,0,58,1,2,0.026561,-0.038922,0.011565,-0.013214
346031,6020940,6020940,0,45,1,1,0.003105,-0.008832,0.048612,-0.000320
346081,6023920,6023920,0,42,2,1,-0.026426,-0.011967,-0.004445,-0.006127


In [36]:
category_counts = Replication_Hdiff['f2247'].value_counts()
print("Number of occurrences per category:")
print(category_counts)

Number of occurrences per category:
0    15083
1     6756
Name: f2247, dtype: int64


In [37]:
subset_IIDs = subset_Other_White['IID'].tolist()  # Convert the "IID" column to a list
Replication_Hboth = Hboth[Hboth['IID'].isin(subset_IIDs)]
Replication_Hboth

Unnamed: 0,FID,IID,f2247_f2257,age,sex,array,PC1,PC2,PC3,PC4
14,1000331,1000331,0,53,2,1,0.076316,-0.046557,-0.003439,-0.008151
19,1000415,1000415,0,65,1,1,-0.010352,0.017313,-0.047860,-0.002530
33,1000701,1000701,0,58,2,1,-0.002600,-0.003810,0.011087,0.001300
38,1000858,1000858,0,61,1,1,0.146847,-0.087013,-0.031188,0.015055
41,1000914,1000914,0,47,1,1,-0.002959,-0.009982,0.026722,-0.003840
...,...,...,...,...,...,...,...,...,...,...
329026,6018674,6018674,1,62,2,1,-0.008631,0.010347,0.000801,-0.005217
329043,6020089,6020089,0,58,1,2,0.026561,-0.038922,0.011565,-0.013214
329059,6020940,6020940,0,45,1,1,0.003105,-0.008832,0.048612,-0.000320
329105,6023920,6023920,0,42,2,1,-0.026426,-0.011967,-0.004445,-0.006127


In [38]:
category_counts = Replication_Hboth['f2247_f2257'].value_counts()
print("Number of occurrences per category:")
print(category_counts)

Number of occurrences per category:
0    15083
1     5677
Name: f2247_f2257, dtype: int64


## Below belongs to Fabiha

# 6. Tinnitus

## 6.1. Remove inconsistencies or unclear individuals

### 6.1.1. Prior to filtering for inconsistencies

<b>Tinnitus</b> <br>
f.4803 = {'No, never': 76141,
 'Yes, but not now, but have in the past': 11400,
 'Yes, now some of the time': 9788,
 'Yes, now a lot of the time': 2973,
 'Yes, now most or all of the time': 7426,
 'Do not know': 1745,
 'Prefer not to answer': 127}

### 6.1.2. Inconsistencies in the tinnitus answers

In [None]:
filtered = saved_filtered

In [None]:
tin_cols = [col for col in filtered if "f.4803" in col]

icd_10_cols = [col for col in df if "f.41270" in col]
icd_9_cols = [col for col in df if "f.41271" in col]

In [None]:
tin_ans = {"Do not know":9, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}
options = set()
# pass one pheno at a time
def find_options(row):
    answer = "".join([str(tin_ans[i]) for i in row if not pd.isna(i) and i != "Prefer not to answer"])
    options.add(answer)

In [None]:
s = filtered[tin_cols].apply(find_options, axis=1)

In [None]:
options

In [None]:
# we might have inconsistencies if we have don't knows mixed with other answers or even yes and no together
# options contains the set of all unique codes 

do_not_know_no = [i for i in options if '0' in i and '9' in i and '1' not in i]
do_not_know_yes = [i for i in options if '0' not in i and '9' in i and '1' in i]
yes_no = [i for i in options if '0' in i and '9' not in i and '1' in i]
with_all_three = [i for i in options if '0' in i and '9' in i and '1' in i]

# collecting all possible occurances that can be flagged for being inconsistent
might_inconsistent = do_not_know_no + do_not_know_yes + yes_no + with_all_three

# these are exceptions to the possibilities we have that can be flagged for being inconsistent
exceptions = ["91","911","9111","991","0001","001","0011","01","011","0111", "0091", "091","0911","0991","9001","901","9011", "90", "900", "9000", "990"]

# collect the list of answer combinations that are actually inconsistent by removing the answer combinations that are exceptions
inconsistent = [i for i in might_inconsistent if i not in exceptions]

In [None]:
inconsistent

#### 6.1.2.1. Filtering out the data

In [None]:
filtered

In [None]:
# will return true if that row should be removed
# pass one pheno at a time
def find_inconsistencies(row):
    for i in row:
        if not pd.isna(i) and i == "I am completely deaf":
            return True
    
    answer = "".join([str(tin_ans[i]) for i in row if not pd.isna(i) and i != "I am completely deaf" and i != "Prefer not to answer"])
    if answer in inconsistent:
        return True
    return False

In [None]:
exclude = filtered[tin_cols].apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

In [None]:
filtered

## 6.2. Identify Pure Control

Need to make sure that for tinnitus we obtain the individuals that say no.

The conditions for saying no are as follows:
* Never saying yes
* Not being inconsistent
* Saying no at least once (prefer not to say is allowed)

We are also not including individuals to be part of the control group if they have certain codes for ICD9, ICD10, or f.20002 (this is in the case that they say no to all tinnitus). However these individuals can still be part of the cases

In [None]:
tin_ans = {"Do not know":9, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

# returns 0 if it's a ctrl or else 1
def find_ctrl(row):
    answer = "".join([str(tin_ans[i]) for i in row if not pd.isna(i) and i != "Prefer not to answer"])
    if "0" in answer and "1" not in answer:
        return False
    return True

In [None]:
f4803_ctrl = filtered[tin_cols].apply(find_ctrl, axis=1)

In [None]:
sum(f4803_ctrl)

### 6.2.1. Collect ICD 10 codes to filter out from Ctrl

In [None]:
exclude_ctrl_icd10 = exclusion[(exclusion["UKBB_field_code"] == "f.41270") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd10

In [None]:
ex_critia_ctrl_icd10 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd10["Phenotype"].tolist()]
ex_critia_ctrl_icd10

In [None]:
icd10 = filtered[icd10_colnames]
icd10

In [None]:
ex_fxn_icd10 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd10)
ex_10 = icd10.apply(ex_fxn_icd10, axis=1)

### 6.2.2. Collect ICD 9 codes to filter out from Ctrl

In [None]:
exclude_ctrl_icd9 = exclusion[(exclusion["UKBB_field_code"] == "f.41271") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd9

In [None]:
ex_critia_ctrl_icd9 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd9["Phenotype"].tolist()]
ex_critia_ctrl_icd9

In [None]:
icd9 = filtered[icd9_colnames]
icd9

In [None]:
ex_fxn_icd9 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd9)
ex_9 = icd9.apply(ex_fxn_icd9, axis=1)

### 6.2.3. Collect f20002 codes to filter out from Ctrl

In [None]:
exclude_ctrl_f20002 = exclusion[(exclusion["UKBB_field_code"] == "f.20002") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_f20002

In [None]:
f20002 = filtered[f20002_colnames]
f20002

In [None]:
ex_critia_ctrl_f20002 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_f20002["Phenotype"].tolist()]
ex_critia_ctrl_f20002

In [None]:
ex_fxn_f20002 = lambda row: contains_exclusion(row, ex_critia_ctrl_f20002)
ex_f20002 = f20002.apply(ex_fxn_f20002, axis=1)

### 6.2.4. Collect individuals with other tinnitus codes to filter out from CTRL

In [None]:
# check if the given code exists in the individuals
def check_code(row, code):
    for i in row:
        if not pd.isna(i) and i == code:
            return 1
    return 0

In [None]:
tinn_icd10_check_code = lambda row: check_code(row, "H931")
tinn_icd10 = filtered[icd_10_cols].apply(tinn_icd10_check_code, axis = 1)

In [None]:
tinn_icd9_check_code = lambda row: check_code(row, "3883")
tinn_icd9 = filtered[icd_9_cols].apply(tinn_icd9_check_code, axis = 1)

In [None]:
self_report_cols = [col for col in filtered if "f.20002" in col]
tinn_self_report_check_code = lambda row: check_code(row, "1597")
tinn_self_report = filtered[self_report_cols].apply(tinn_self_report_check_code, axis = 1)

### 6.2.5. Filter out Tinnitus Ctrl

In [None]:
sum(f4803_ctrl | ex_10 | ex_9 | ex_f20002 | tinn_icd10 | tinn_icd9 | tinn_self_report)

In [None]:
temp = f4803_ctrl | ex_10 | ex_9 | ex_f20002 | tinn_icd10 | tinn_icd9 | tinn_self_report
filtered_ctrl = [1 if i else 0 for i in temp.to_list()]

In [None]:
filtered["tinnitus_pure_ctrl"] = filtered_ctrl

In [None]:
filtered

## 6.3. Identify Age

In [None]:
ages_f21003_col = [col for col in filtered if "f.21003" in col]
ages_f21003_col

In [None]:
#get the minimum age of each individual in the given columns
def get_min_age(row):
    temp = [i for i in row.to_list() if not pd.isna(i)]
    if len(temp) > 0:
        return min(temp)
    else:
        return pd.NA

In [None]:
filtered["tinnitus_age"] = filtered[ages_f21003_col].apply(get_min_age, axis=1)

In [None]:
saved_tinn = filtered

In [None]:
filtered = saved_tinn

## 6.4. Noisy workplace and Loud Music Variable

Two variables that we need to control for in the analysis are f.4825 (noisy workplace) and f.4836 (loud music).

### 6.4.1. Check for inconsistencies

<b>f.4825 "Have you ever worked in a noisy place where you had to shout to be heard?"</b> <br>
"No"= "0", <br>
"Yes, for less than a year"= "1", <br>
"Yes, for around 1-5 years"= "2", <br>
"Yes, for more than 5 years"= "3", <br>
"Prefer not to answer"= NA, <br>
"Do not know"= NA <br>

<b>f.4836 "Have you ever listened to music for more than 3 hours per week at a volume which you would need to shout to be heard or, if wearing headphones, someone else would need to shout for you to hear them?" </b> <br>
"No"= "0", <br>
"Yes, for less than a year"= "1", <br>
"Yes, for around 1-5 years"= "2", <br>
"Yes, for more than 5 years"= "3", <br>
"Prefer not to answer"= NA, <br>
"Do not know"= NA <br>

In [None]:
noise_wp_cols = [col for col in df if "f.4825" in col]
loud_music_cols = [col for col in df if "f.4836" in col]

In [None]:
noise_loud_answers = {"No":0, "Yes, for less than a year":1, "Yes, for around 1-5 years":2, "Yes, for more than 5 years":3}

# if the answers we have are not sorted in the order that they're in the list, then that individual is inconsistent
def find_inconsistencies_noisy_loud(row):
    ans = [noise_loud_answers[i] for i in row if i in noise_loud_answers.keys()] 
    return sorted(ans) != ans
        

In [None]:
exclude = filtered[noise_wp_cols].apply(find_inconsistencies_noisy_loud, axis=1)
filtered = filtered[~exclude]

In [None]:
exclude = filtered[loud_music_cols].apply(find_inconsistencies_noisy_loud, axis=1)
filtered = filtered[~exclude]

### 6.4.2. Label Noise and Loud Music

In [None]:
# find individuals that have yes for either noise or loud sounds
def find_label_noisy_loud(row):
    ans = [noise_loud_answers[i] for i in row if i in noise_loud_answers.keys()]
    if len(ans) > 0:
        return ans[-1]
    return pd.NA

In [None]:
filtered["noise_wp"] = filtered[noise_wp_cols].apply(find_label_noisy_loud, axis=1)

In [None]:
filtered["noise_wp"] = filtered["noise_wp"].fillna( int(filtered["noise_wp"].median(skipna=True)) )

In [None]:
filtered["loud_music"] = filtered[loud_music_cols].apply(find_label_noisy_loud, axis=1)

In [None]:
filtered["loud_music"] = filtered["loud_music"].fillna( int(filtered["loud_music"].median(skipna=True)) )

In [None]:
filtered

## 6.5. Identify Cases

**Analysis plan:**

1. Individuals who currently have tinnitus (all four yes categories) vs no never only controlling for sex, age, noisy workplace and loud music frequency ("crude") (No tinnitus vs anytype of tinnitus). For this analysis the missing data of the noise variables was imputed using the median for cases and controls separately

3. Individuals in the two top YES categories vs NO never ('No' tinnitus vs 'Yes, now all of the time' and 'Yes, now most of the time')

4. Individuals that say yes in the top 3 categories vs No never (Remove category 'yes, but not now, but have it in the past')

5. Individuals who currently have tinnitus (all four yes categories and tinnitus codes). Not filtering for issues with noisy workplace and loud music.

In [None]:
# return 1 if we have a yes (used to find phenos)
def find_yes(row):
    for i in row:
        if not pd.isna(i) and i != "Prefer not to answer" and i in tin_ans.keys() and tin_ans[i] == 1:
            return 1
    return 0

### 6.5.1. Analysis Plan 1

In [None]:
tin_ans = {"Do not know":9, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [None]:
tinn_yes = filtered[tin_cols].apply(find_yes, axis=1)

In [None]:
filtered["tinnitus_1"] = tinn_yes

### 6.5.2. Analysis Plan 2

In [None]:
tin_ans = {"Do not know":9, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [None]:
tinn_yes = filtered[tin_cols].apply(find_yes, axis=1)

In [None]:
filtered["tinnitus_2"] = tinn_yes

### 6.5.3. Analysis Plan 3

In [None]:
tin_ans = {"Do not know":9, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [None]:
tinn_yes = filtered[tin_cols].apply(find_yes, axis=1)

In [None]:
sum(tinn_yes)

In [None]:
filtered["tinnitus_3"] = tinn_yes

### 6.5.4. Analysis Plan 4

In [None]:
filtered_for4 = saved_tinn

In [None]:
tin_ans = {"Do not know":9, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [None]:
tinn_yes = filtered_for4[tin_cols].apply(find_yes, axis=1)

In [None]:
sum(tinn_yes | tinn_icd10 | tinn_icd9 | tinn_self_report)

In [None]:
filtered_for4["tinnitus_4"] = tinn_yes | tinn_icd10 | tinn_icd9 | tinn_self_report

In [None]:
sum(filtered_for4["tinnitus_4"])

## 6.6. File Output

In [None]:
filtered

In [None]:
filtered_for4

In [None]:
filtered[filtered["tinnitus_pure_ctrl"] == 0][["FID", "IID", "sex", "tinnitus_pure_ctrl", "tinnitus_age", "noise_wp", "loud_music"]].to_csv("tinnitus_pure_ctrl_pheno_file.tsv", sep='\t', index=False)
#filtered[filtered["tinnitus_pure_ctrl"] == 0][["FID", "IID", "sex", "tinnitus_pure_ctrl", "ctrl_age"]].to_csv("tinnitus_pure_ctrl_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["tinnitus_1"] == 1][["FID", "IID", "sex", "tinnitus_1", "tinnitus_age", "noise_wp", "loud_music"]].to_csv("tinnitus_1_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["tinnitus_2"] == 1][["FID", "IID", "sex", "tinnitus_2", "tinnitus_age", "noise_wp", "loud_music"]].to_csv("tinnitus_2_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["tinnitus_3"] == 1][["FID", "IID", "sex", "tinnitus_3", "tinnitus_age", "noise_wp", "loud_music"]].to_csv("tinnitus_3_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered_for4[filtered_for4["tinnitus_4"] == 1][["FID", "IID", "sex", "tinnitus_4", "tinnitus_age"]].to_csv("tinnitus_4_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered_for4[filtered_for4["tinnitus_4"] == 1][["FID", "IID", "sex", "tinnitus_4", "tinnitus_age"]]

# 7. Merge Pheno with Ctrl

## 7.1. f2247, f2257, f3393, and mendilian

In [None]:
ctrl_file_name = "pure_ctrl_pheno_file.tsv"
f3393_file_name = "f3393_pheno_file.tsv"
f2247_file_name = "f2247_pheno_file.tsv"
f2257_file_name = "f2257_pheno_file.tsv"
f2247_f2257_file_name = "f2247_f2257_pheno_file.tsv"
mendilianlike_file_name = "mendelian_pheno_file.tsv"

In [None]:
f3393 = pd.read_csv(f3393_file_name, sep="\t")
f2247 = pd.read_csv(f2247_file_name, sep="\t")
f2257 = pd.read_csv(f2257_file_name, sep="\t")
f2247_f2257 = pd.read_csv(f2247_f2257_file_name, sep="\t")
ctrl = pd.read_csv(ctrl_file_name, sep="\t")
mendlike = pd.read_csv(mendilianlike_file_name, sep="\t")

In [None]:
print("ctrl: ",len(ctrl))
print("f2247: ",len(f2247))
print("f2257: ",len(f2257))
print("f2247_f2257: ",len(f2247_f2257))
print("f3393: ",len(f3393))
print("mendlike: ",len(mendlike))

In [None]:
mendlike[mendlike["mendelian_age"] > 40]

In [None]:
mendlike[mendlike["mendelian_age"] < 18]

### 7.1.1. f2247

In [None]:
# create pheno file
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f2247.columns[3]})
f2247 = f2247.rename(columns={f2247.columns[4]:"age"})

full_pheno = f2247.append(ctrl)
full_pheno[["FID", "IID", "sex", "f2247", "age"]].to_csv("090321_UKBB_Hearing_difficulty_f2247_expandedwhite_45502cases_96601ctrl", sep='\t', index=False)

In [None]:
# create PCA pheno file
full_pheno[["FID", "IID", "ethnicity"]].to_csv("090321_UKBB_Hearing_difficulty_f2247_expandedwhite_45502cases_96601ctrl.phenopca", sep='\t', index=False)

In [None]:
# keep id file for genotype data selection
full_pheno[["FID", "IID"]].to_csv("090321_UKBB_Hearing_difficulty_f2247_expandedwhite_45502cases_96601ctrl.keep_id", sep='\t', index=False, header=False)

### 7.1.2. f2257

In [None]:
# create pheno file
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f2257.columns[3]})
f2257 = f2257.rename(columns={f2257.columns[4]:"age"})

full_pheno = f2257.append(ctrl)
full_pheno[["FID", "IID", "sex", "f2257", "age"]].to_csv("090321_UKBB_Hearing_noise_f2257_expandedwhite_65660cases_96601ctrl", sep='\t', index=False)

In [None]:
# create PCA pheno file
full_pheno[["FID", "IID", "ethnicity"]].to_csv("090321_UKBB_Hearing_noise_f2257_expandedwhite_65660cases_96601ctrl.phenopca", sep='\t', index=False)

In [None]:
# keep id file for genotype data selection
full_pheno[["FID", "IID"]].to_csv("090321_UKBB_Hearing_noise_f2257_expandedwhite_65660cases_96601ctrl.keep_id", sep='\t', index=False, header=False)

### 7.1.3. f2247_f2257

In [None]:
# create pheno file
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f2247_f2257.columns[3]})
f2247_f2257 = f2247_f2257.rename(columns={f2247_f2257.columns[4]:"age"})

full_pheno = f2247_f2257.append(ctrl)
full_pheno[["FID", "IID", "sex", "f2247_f2257", "age"]].to_csv("090321_UKBB_Combined_f2247_f2257_expandedwhite_38410cases_96601ctrl", sep='\t', index=False)

In [None]:
# create PCA pheno file
full_pheno[["FID", "IID", "ethnicity"]].to_csv("090321_UKBB_Combined_f2247_f2257_expandedwhite_38410cases_96601ctrl.phenopca", sep='\t', index=False)

In [None]:
# keep id file for genotype data selection
full_pheno[["FID", "IID"]].to_csv("090321_UKBB_Combined_f2247_f2257_expandedwhite_38410cases_96601ctrl.keep_id", sep='\t', index=False, header=False)

### 7.1.4. f3393

In [None]:
# create pheno file
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f3393.columns[3]})
f3393 = f3393.rename(columns={f3393.columns[4]:"age"})

full_pheno = f3393.append(ctrl)
full_pheno[["FID", "IID", "sex", "f3393", "age"]].to_csv("090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl", sep='\t', index=False)

In [None]:
# create PCA pheno file
full_pheno[["FID", "IID", "ethnicity"]].to_csv("090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl.phenopca", sep='\t', index=False)

In [None]:
# keep id file for genotype data selection
full_pheno[["FID", "IID"]].to_csv("090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl.keep_id", sep='\t', index=False, header=False)

### 7.1.5. Mendelian

In [None]:
# create pheno file
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:mendlike.columns[3]})
mendlike = mendlike.rename(columns={mendlike.columns[4]:"age"})

full_pheno = mendlike.append(ctrl)
full_pheno[["FID", "IID", "sex", "mendelian", "age"]].to_csv("090321_UKBB_Mendelian_expandedwhite_2686cases_96601ctrl", sep='\t', index=False)

In [None]:
# create PCA pheno file
full_pheno[["FID", "IID", "ethnicity"]].to_csv("090321_UKBB_Mendelian_expandedwhite_2686cases_96601ctrl.phenopca", sep='\t', index=False)

In [None]:
# keep id file for genotype data selection
full_pheno[["FID", "IID"]].to_csv("090321_UKBB_Mendelian_expandedwhite_2686rcases_96601ctrl.keep_id", sep='\t', index=False, header=False)

## 7.2. Tinnitus

In [None]:
tinnitus_ctrl_file_name = "tinnitus_pure_ctrl_pheno_file.tsv"
tinnitus_1_file_name = "tinnitus_1_pheno_file.tsv"
tinnitus_2_file_name = "tinnitus_2_pheno_file.tsv"
tinnitus_3_file_name = "tinnitus_3_pheno_file.tsv"
tinnitus_4_file_name = "tinnitus_4_pheno_file.tsv"

In [None]:
tinnitus_ctrl = pd.read_csv(tinnitus_ctrl_file_name, sep="\t")
tinnitus_1 = pd.read_csv(tinnitus_1_file_name, sep="\t")
tinnitus_2 = pd.read_csv(tinnitus_2_file_name, sep="\t")
tinnitus_3 = pd.read_csv(tinnitus_3_file_name, sep="\t")
tinnitus_4 = pd.read_csv(tinnitus_4_file_name, sep="\t")

In [None]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[4]:"age"})
tinnitus_1 = tinnitus_1.rename(columns={tinnitus_1.columns[4]:"age"})
tinnitus_2 = tinnitus_2.rename(columns={tinnitus_2.columns[4]:"age"})
tinnitus_3 = tinnitus_3.rename(columns={tinnitus_3.columns[4]:"age"})
tinnitus_4 = tinnitus_4.rename(columns={tinnitus_4.columns[4]:"age"})

In [None]:
print("ctrl: ",len(tinnitus_ctrl))
print("tinnitus 1: ",len(tinnitus_1))
print("tinnitus 2: ",len(tinnitus_2))
print("tinnitus 3: ",len(tinnitus_3))
print("tinnitus 4: ",len(tinnitus_4))

### 7.2.1. Analysis 1

In [None]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[3]:tinnitus_1.columns[3]})
full_tinnitus_1 = tinnitus_1.append(tinnitus_ctrl)
full_tinnitus_1

### 7.2.2. Analysis 2

In [None]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[3]:tinnitus_2.columns[3]})
full_tinnitus_2 = tinnitus_2.append(tinnitus_ctrl)
full_tinnitus_2

### 7.2.3. Analysis 3

In [None]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[3]:tinnitus_3.columns[3]})
full_tinnitus_3 = tinnitus_3.append(tinnitus_ctrl)
full_tinnitus_3

### 7.2.4. Analysis 4

In [None]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[3]:tinnitus_4.columns[3]})
full_tinnitus_4 = tinnitus_4.append(tinnitus_ctrl[tinnitus_ctrl.columns[:-2]])
full_tinnitus_4