# Generate the phenotypes for the hearing impairment traits from the UKBB for the 500K individuals with imputed data

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

# 1. Read in the data

## 1.1. Read in database

In [13]:
# collect the necessary column names of the database for our analysis

with open("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/092821_UKBB_486416ind_call90.csv") as fp:
    line = fp.readline() # header
    header = line.split(",")
    
    indiv = ["IID", "FID"]
    icd10_colnames = [col.strip('"') for col in header if "f.41270." in col]
    icd10_ages = [col.strip('"') for col in header if "f.41280." in col]
    icd9_colnames = [col.strip('"') for col in header if "f.41271." in col]
    icd9_ages = [col.strip('"') for col in header if "f.41281." in col]
    f20002_colnames = [col.strip('"') for col in header if "f.20002." in col]
    reported_sex = ["f.31.0.0"]
    genetic_sex = ["f.22001.0.0"]
    ethnicity = [col.strip('"') for col in header if "f.21000." in col]
    hearing_imp_f3393 = [col.strip('"') for col in header if "f.3393." in col]
    hearing_imp_f2247 = [col.strip('"') for col in header if "f.2247." in col]
    hearing_imp_f2257 = [col.strip('"') for col in header if "f.2257." in col]
    tin_cols = [col.strip('"') for col in header if "f.4803." in col]
    ages_f21003_col = [col.strip('"') for col in header if "f.21003." in col]
    ages_f131258_col = [col.strip('"') for col in header if 'f.131258.' in col]
    year_of_birth = [col.strip('"') for col in header if "f.34." in col]
    month_of_birth = [col.strip('"') for col in header if "f.52." in col]

In [3]:
# combine the column names into one list
combined_cols = indiv + icd10_colnames + icd10_ages + icd9_colnames + icd9_ages + f20002_colnames + ethnicity + reported_sex + genetic_sex + hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257 + tin_cols + ages_f21003_col + ages_f131258_col + year_of_birth + month_of_birth

In [None]:
print(datetime.now())

In [4]:
# database of all individuals that we are working with and the selected phenotypes
df = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/092821_UKBB_486416ind_call90.csv", quotechar = '"', dtype="string", usecols=combined_cols)
df

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,
486412,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,
486413,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,
486414,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,


In [None]:
print(datetime.now())

## 1.2. Read in exclusion criteria for icd10, icd9, and self-report

In [5]:
# csv file that contains information on the exclusion criteria for cases and controls
exclusion = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/ICD10_9_selfreport_incl_excl.csv")
exclusion

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
0,f.41270,H60-H62 Diseases of external ear,,,,,,,,,
1,f.41270,H60 Otitis externa,,,,,,,,,
2,f.41270,H60.0 Abscess of external ear,32.0,N,N,,,,,,
3,f.41270,H60.1 Cellulitis of external ear,218.0,N,N,,,,,,
4,f.41270,H60.2 Malignant otitis externa,49.0,N,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
566,f.20002,1491 brain haemorrhage,218.0,Y,,,,,,,
567,f.20002,1583 ischaemic stroke,44.0,N,N,,,,,,
568,f.20002,1082 transient ischaemic attack (tia),2243.0,N,N,,,,,,
569,f.20002,1083 subdural haemorrhage/haematoma,212.0,Y,,,,,,,


## 1.3. Read in individuals in genotype array QC

In [6]:
qc_individuals = pd.read_csv("/mnt/mfs/statgen/UKBiobank/results/092821_PCA_results_500K/white_europeans/cache/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.white_europeans.filtered.fam", sep="\t", header=None)
qc_individuals

Unnamed: 0,0,1,2,3,4,5
0,1000019,1000019,0,0,2,-9
1,1000022,1000022,0,0,1,-9
2,1000035,1000035,0,0,1,-9
3,1000046,1000046,0,0,2,-9
4,1000054,1000054,0,0,2,-9
...,...,...,...,...,...,...
460644,6025390,6025390,0,0,2,-9
460645,6025409,6025409,0,0,2,-9
460646,6025411,6025411,0,0,2,-9
460647,6025425,6025425,0,0,2,-9


## 1.4. Read in PCA outlier file

In [7]:
# outlier individuals that will need to be removed
outlier = pd.read_csv("/mnt/mfs/statgen/UKBiobank/results/092821_PCA_results_500K/092821_PCA_related_pval0.005/ukb47922_white_460649ind.092821_PCA_related_pval0.005.pca.projected.outliers", sep="\t", header=None)
outlier

Unnamed: 0,0,1
0,1003423,1003423
1,1008606,1008606
2,1009852,1009852
3,1010412,1010412
4,1010678,1010678
...,...,...
1377,5801962,5801962
1378,5807807,5807807
1379,5809112,5809112
1380,5833189,5833189


# 2. Sample QC

## 2.1. Remove individuals that do not match for reported and genetic sex

In [8]:
reported_sex = ["f.31.0.0"]
genetic_sex = ["f.22001.0.0"]

In [9]:
# returns true only if 
def inconsistent_sexes(row):
    return pd.isna(row[genetic_sex[0]]) or row[reported_sex[0]] != row[genetic_sex[0]]

In [10]:
# exclusion based on inconsistent sex
ex_sex = df[reported_sex + genetic_sex].apply(inconsistent_sexes, axis=1)

In [11]:
filtered = df[~ex_sex]

In [12]:
print(sum(ex_sex), "individuals removed because of inconsistency with the genetic and reported sex variables")

0 individuals removed because of inconsistency with the genetic and reported sex variables


In [13]:
print("Of these individuals", sum([1 for x in df[genetic_sex[0]].to_list() if pd.isna(x)]), "were NA for the genetic sex variable")

Of these individuals 0 were NA for the genetic sex variable


In [14]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,
486412,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,
486413,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,
486414,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,


## 2.2. Remove non-white individuals

In [15]:
# set of answers for the ethnicity question
set(filtered[ethnicity[0]].to_list()).union( set(filtered[ethnicity[1]].to_list()) , set(filtered[ethnicity[2]].to_list()))

{<NA>,
 'African',
 'Any other Asian background',
 'Any other Black background',
 'Any other mixed background',
 'Any other white background',
 'Asian or Asian British',
 'Bangladeshi',
 'Black or Black British',
 'British',
 'Caribbean',
 'Chinese',
 'Do not know',
 'Indian',
 'Irish',
 'Mixed',
 'Other ethnic group',
 'Pakistani',
 'Prefer not to answer',
 'White',
 'White and Asian',
 'White and Black African',
 'White and Black Caribbean'}

In [16]:
# these should align with all possible options for ethnicity answers except for <NA>, Do not know, and Prefer not to answer
white = ['British', 'Irish', 'White','Any other white background']
african = ['Caribbean','White and Black Caribbean', 'African', 'White and Black African', 'Black or Black British', 'Any other Black background' ]
asian = ['Indian', 'Pakistani', 'White and Asian', 'Any other Asian background', 'Bangladeshi', 'Asian or Asian British']
mixed = ['Mixed', 'Any other mixed background']
chinese = ['Chinese']
other = ['Other ethnic group']

# figure out the ancestry of each individual
def ancestry(row):
    temp = [x for x in row[ethnicity] if not pd.isna(x) and x != "Prefer not to answer" and x != "Do not know"]
    if len(temp) == 0:
        return "Unknown"
    
    if len(set(temp)) == 1 and temp[0] in white: # if we have only one unique answer and the answer is in the white variable
        return "_".join(temp[0].split(" ")) # return the unique answer
    
    if len([x for x in temp if x in white]) == len(temp):
        return "Inconsistent_white"
    if len([x for x in temp if x in african]) == len(temp):
        return "African"
    if len([x for x in temp if x in asian]) == len(temp):
        return "Asian"
    if len([x for x in temp if x in mixed]) == len(temp):
        return "Mixed"
    if len([x for x in temp if x in chinese]) == len(temp):
        return "Chinese"
    if len([x for x in temp if x in other]) == len(temp):
        return "Other"
    return "Inconsistent"

Set an ancestry row that combines the ancestry answers from the database into one 

In [17]:
filtered["ethnicity"] = filtered[ethnicity].apply(ancestry, axis=1)

  filtered["ethnicity"] = filtered[ethnicity].apply(ancestry, axis=1)


In [18]:
def find_non_white(row):
    return row["ethnicity"] not in white and row["ethnicity"] != "Unknown" and row["ethnicity"] != "Inconsistent_white" and row["ethnicity"] != "Any_other_white_background"

In [19]:
ex_non_white = filtered[["ethnicity"]].apply(find_non_white, axis=1)

In [20]:
filtered = filtered[~ex_non_white]

In [21]:
print(sum(ex_non_white), "individuals removed for being non-white")

25767 individuals removed for being non-white


In [22]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,British
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,British
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,British
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,British
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,British
486412,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,British
486413,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,British
486414,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,British


In [23]:
filtered[filtered["ethnicity"] == "British"]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,British
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,British
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,British
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,British
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,British
486412,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,British
486413,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,British
486414,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,British


In [24]:
filtered[filtered["ethnicity"] == "Irish"]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity
8,1000090,1000090,Female,1945,July,Yes,,,,No,...,,,,,,,,,,Irish
37,1000415,1000415,Male,1942,December,No,,,,No,...,,,,,,,,,,Irish
125,1001316,1001316,Male,1964,September,No,,Yes,,No,...,,,,,,,,,,Irish
143,1001492,1001492,Male,1947,August,No,,,,Do not know,...,,,,,,,,,,Irish
195,1002031,1002031,Female,1946,February,No,,,,No,...,,,,,,,,,,Irish
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486100,6022183,6022183,Male,1946,June,Yes,,,,No,...,,,,,,,,,,Irish
486165,6022857,6022857,Male,1964,November,No,,,,No,...,,,,,,,,,,Irish
486234,6023551,6023551,Female,1966,February,No,,No,,No,...,,,,,,,,,,Irish
486261,6023832,6023832,Male,1942,June,No,,,,No,...,,,,,,,,,,Irish


In [25]:
filtered[filtered["ethnicity"] == "White"]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity
850,1008841,1008841,Female,1964,October,Yes,,,,Yes,...,,,,,,,,,,White
1160,1012043,1012043,Male,1943,June,Yes,,,,Yes,...,,,,,,,,,,White
1665,1017312,1017312,Female,1947,March,No,,,,No,...,,,,,,,,,,White
1884,1019550,1019550,Female,1942,July,No,,,,No,...,,,,,,,,,,White
6620,1068621,1068621,Female,1961,January,Prefer not to answer,,,,Prefer not to answer,...,,,,,,,,,,White
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482378,5983819,5983819,Female,1943,October,Yes,,,,Yes,...,,,,,,,,,,White
482627,5986358,5986358,Female,1955,May,No,,,,Yes,...,,,,,,,,,,White
483842,5998866,5998866,Male,1946,March,No,,,,No,...,,,,,,,,,,White
484053,6001050,6001050,Female,1939,May,No,,,,No,...,,,,,,,,,,White


In [26]:
filtered[filtered["ethnicity"] == "Inconsistent_white"]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity
681,1007061,1007061,Male,1950,May,No,,No,,No,...,,,,,,,,,,Inconsistent_white
2407,1024996,1024996,Female,1939,February,No,Do not know,,,No,...,,,,,,,,,,Inconsistent_white
3999,1041484,1041484,Male,1951,December,No,No,,,No,...,,,,,,,,,,Inconsistent_white
4369,1045313,1045313,Male,1953,August,No,No,,,No,...,,,,,,,,,,Inconsistent_white
5778,1059988,1059988,Male,1950,July,No,No,No,,No,...,,,,,,,,,,Inconsistent_white
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482218,5982180,5982180,Male,1967,June,No,Yes,,,No,...,,,,,,,,,,Inconsistent_white
482540,5985476,5985476,Female,1955,October,No,No,No,,Yes,...,,,,,,,,,,Inconsistent_white
483033,5990529,5990529,Female,1949,August,No,No,No,,No,...,,,,,,,,,,Inconsistent_white
484939,6010193,6010193,Female,1948,April,No,No,No,,Yes,...,,,,,,,,,,Inconsistent_white


In [27]:
filtered[filtered["ethnicity"] == "Unknown"]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity
65,1000701,1000701,Female,1949,October,No,,,,No,...,,,,,,,,,,Unknown
745,1007738,1007738,Female,1944,December,No,,,,No,...,,,,,,,,,,Unknown
856,1008909,1008909,Male,1944,March,No,,,,Yes,...,,,,,,,,,,Unknown
865,1008997,1008997,Male,1945,August,No,,,,No,...,,,,,,,,,,Unknown
948,1009852,1009852,Male,1963,June,Yes,,,,Yes,...,,,,,,,,,,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485524,6016238,6016238,Female,1953,February,No,,,,No,...,,,,,,,,,,Unknown
485740,6018439,6018439,Male,1964,January,No,,,,No,...,,,,,,,,,,Unknown
485763,6018674,6018674,Female,1947,April,Yes,,,,Yes,...,,,,,,,,,,Unknown
486078,6021951,6021951,Male,1967,January,,,,,,...,,,,,,,,,,Unknown


In [28]:
filtered[filtered["ethnicity"] == "Any_other_white_background"]

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity
30,1000331,1000331,Female,1956,December,No,,,,No,...,,,,,,,,,,Any_other_white_background
72,1000776,1000776,Female,1946,June,No,,,,Do not know,...,,,,,,,,,,Any_other_white_background
74,1000799,1000799,Male,1963,July,No,,,,Yes,...,,,,,,,,,,Any_other_white_background
80,1000858,1000858,Male,1947,May,No,,,,No,...,,,,,,,,,,Any_other_white_background
86,1000914,1000914,Male,1962,July,No,,,,No,...,,,,,,,,,,Any_other_white_background
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486270,6023920,6023920,Female,1967,September,No,,,,No,...,,,,,,,,,,Any_other_white_background
486283,6024051,6024051,Female,1960,February,No,,,,No,...,,,,,,,,,,Any_other_white_background
486288,6024100,6024100,Male,1952,June,Yes,,,,Yes,...,,,,,,,,,,Any_other_white_background
486298,6024208,6024208,Female,1965,November,Yes,,,,No,...,,,,,,,,,,Any_other_white_background


In [29]:
saved_for_number_checking = filtered

In [30]:
filtered = saved_for_number_checking

In [None]:
filtered[["FID","IID","ethnicity"]].to_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/ukb47922_white_460649ind.pheno", sep="\t", index=False)

## 2.3. Only keep individuals that passed genotype array QC

In [None]:
filtered

In [31]:
qc_list = [str(i) for i in qc_individuals[0].to_list()]
def matches_qc_individuals(row):
    return row["FID"] in qc_list
filtered = filtered[filtered[["FID"]].apply(matches_qc_individuals, axis=1)]

# 2. Remove PCA outliers from the full database

Remove the outlier individuals from the full database if there exists any.

In [32]:
# since the IID from the dataframe is in string the outlier ids have to be made into string as well
out_ids = [str(x) for x in outlier[0].to_list()] 

def find_outliers(row):
    return row["IID"] in out_ids

In [33]:
ex_pca_outliers = filtered[["IID", "FID"]].apply(find_outliers, axis=1)

In [34]:
filtered = filtered[~ex_pca_outliers]

In [35]:
print(sum(ex_pca_outliers), "individuals removed for being pca outliers")

1382 individuals removed for being pca outliers


In [36]:
#len(missing_cases - set(df["IID"].to_list()))

In [37]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,...,,,,,,,,,,British
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,British
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,British
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,British
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,British
486412,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,British
486413,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,British
486414,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,British


# 3. Filter out exclusions from the full database

If individuals have certain codes from ICD 10, ICD 9, and self-reports they must be fully removed from the analysis. 

In [38]:
# returns if the current individual should be excluded based on the exclusion list
def contains_exclusion(row, exclusion_list):
    for i in row:
        if not pd.isna(i) and i in exclusion_list:
            return True
            
    return False

## 3.1. Filter out ICD 10 exclusions

In [39]:
# these are the columns that represent the icd10 columns in the database
icd10_colnames = [col for col in filtered if "f.41270" in col]

In [40]:
# get a dataframe that only contains the icd10 columns from the full database
icd10 = filtered[icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,...,f.41270.0.216,f.41270.0.217,f.41270.0.218,f.41270.0.219,f.41270.0.220,f.41270.0.221,f.41270.0.222,f.41270.0.223,f.41270.0.224,f.41270.0.225
0,E041,H738,M750,M754,M758,N898,N920,N946,R104,Z038,...,,,,,,,,,,
1,F101,J342,R619,S8280,W010,,,,,,...,,,,,,,,,,
2,H269,K579,K590,K621,M5459,N40,R040,R31,R398,Z466,...,,,,,,,,,,
3,E780,G473,R065,R074,Z824,,,,,,...,,,,,,,,,,
4,C679,C787,C795,C798,D090,I802,I959,J181,K922,M169,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,A099,D649,E279,E538,E559,I10,I839,K449,K573,K649,...,,,,,,,,,,
486412,,,,,,,,,,,...,,,,,,,,,,
486413,O149,O266,O342,O471,O48,O610,O680,Z370,,,...,,,,,,,,,,
486414,G551,M501,,,,,,,,,...,,,,,,,,,,


In [41]:
# get rows from exclusion database that contian the codes that need to be removed for icd10
exclude_icd10 = exclusion[(exclusion["UKBB_field_code"] == "f.41270") & (exclusion["Excluded_fulldb_lateonsetHI"] == 'Y') ]
exclude_icd10

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
27,f.41270,H65.2 Chronic serous otitis media,103.0,Y,,,,,,,
28,f.41270,H65.3 Chronic mucoid otitis media,960.0,Y,,,,,,,
29,f.41270,H65.4 Other chronic nonsuppurative otitis media,158.0,Y,,,,,,,
30,f.41270,"H65.9 Nonsuppurative otitis media, unspecified",508.0,Y,,,,,,,
33,f.41270,H66.1 Chronic tubotympanic suppurative otitis ...,40.0,Y,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
276,f.41270,"S07.9 Crushing injury of head, part unspecified",1.0,Y,,,,,,,
279,f.41270,S08.1 Traumatic amputation of ear,13.0,Y,,,,,,,
280,f.41270,S08.8 Traumatic amputation of other parts of head,1.0,Y,,,,,,,
281,f.41270,S08.9 Traumatic amputation of unspecified part...,1.0,Y,,,,,,,


In [42]:
# get the icd10 codes that should be excluded from database
ex_critia_icd10 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_icd10["Phenotype"].tolist()]
ex_critia_icd10

['H652',
 'H653',
 'H654',
 'H659',
 'H661',
 'H662',
 'H663',
 'H664',
 'H669',
 'H680',
 'H701',
 'H702',
 'H708',
 'H709',
 'H71',
 'H731',
 'H738',
 'H739',
 'H740',
 'H741',
 'H742',
 'H743',
 'H748',
 'H749',
 'H750',
 'H758',
 'H800',
 'H801',
 'H802',
 'H808',
 'H809',
 'H810',
 'H830',
 'H831',
 'H832',
 'H900',
 'H901',
 'H902',
 'H910',
 'H933',
 'H940',
 'H948',
 'H950',
 'H951',
 'H958',
 'H959',
 'B020',
 'B021',
 'B022',
 'B023',
 'B027',
 'B028',
 'G000',
 'G001',
 'G002',
 'G003',
 'G008',
 'G009',
 'G01',
 'G020',
 'G021',
 'G028',
 'G030',
 'G031',
 'G032',
 'G038',
 'G039',
 'G040',
 'G041',
 'G042',
 'G048',
 'G049',
 'G050',
 'G051',
 'G052',
 'G058',
 'G060',
 'G061',
 'G062',
 'G07',
 'G08',
 'G09',
 'G510',
 'G511',
 'G512',
 'G513',
 'G514',
 'G518',
 'G519',
 'S0200',
 'S0201',
 'S0210',
 'S0211',
 'S0240',
 'S0241',
 'S0260',
 'S0261',
 'S0270',
 'S0271',
 'S0280',
 'S0281',
 'S0290',
 'S0291',
 'S045',
 'S046',
 'S049',
 'S0600',
 'S0601',
 'S0610',
 'S0611

In [43]:
# collect the individuals that should be excluded because of icd10
ex_fxn_icd10 = lambda row: contains_exclusion(row, ex_critia_icd10)
ex_10 = icd10.apply(ex_fxn_icd10, axis=1)

In [44]:
# remove them from the working database (which is now filtered. df remains unchanged)
filtered = filtered[~ex_10]

In [45]:
print(sum(ex_10), "individuals removed because of icd10 codes")

12692 individuals removed because of icd10 codes


In [46]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,British
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,British
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,British
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,British
5,1000063,1000063,Male,1967,April,No,,,,No,...,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,British
486412,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,British
486413,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,British
486414,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,British


In [47]:
#len(missing_cases - set(filtered["IID"].to_list()))

## 3.2. Filter out ICD 9 exclusions

In [48]:
# these are the columns that represent the icd9 columns in the working database
icd9_colnames = [col for col in filtered if "f.41271" in col]

In [49]:
# get a dataframe that only contains the icd9 columns from the working database
icd9 = filtered[icd9_colnames]
icd9

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,...,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,3000,5198,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,,,,,,,,,,,...,,,,,,,,,,
486412,,,,,,,,,,,...,,,,,,,,,,
486413,,,,,,,,,,,...,,,,,,,,,,
486414,,,,,,,,,,,...,,,,,,,,,,


In [50]:
# get rows from exclusion database that contian the codes that need to be removed for icd9
exclude_icd9 = exclusion[(exclusion["UKBB_field_code"] == "f.41271") & (exclusion["Excluded_fulldb_lateonsetHI"] == 'Y') ]
exclude_icd9

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
319,f.41271,3811 Chronic serous otitis media,8.0,Y,,,,,,,
320,f.41271,3812 Chronic mucoid otitis media,11.0,Y,,,,,,,
321,f.41271,3813 Other and unspecified chronic nonsuppurat...,3.0,Y,,,,,,,
322,f.41271,"3814 Nonsuppurative otitis media, not specifie...",19.0,Y,,,,,,,
323,f.41271,3815 Eustachian salpingitis,0.0,Y,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
516,f.41271,9050 Late effect of fracture of skull and face...,19.0,Y,,,,,,,
526,f.41271,"9259 Crushing injury of face, scalp and neck",2.0,Y,,,,,,,
532,f.41271,9514 Injury to facial nerve,0.0,Y,,,,,,,
533,f.41271,9515 Injury to acoustic nerve,1.0,Y,,,,,,,


In [51]:
# get the icd9 codes that should be excluded from the working database
ex_critia_icd9 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_icd9["Phenotype"].tolist()]
ex_critia_icd9

['3811',
 '3812',
 '3813',
 '3814',
 '3815',
 '3816',
 '3819',
 '3821',
 '3822',
 '3823',
 '3824',
 '3829',
 '3831',
 '3832',
 '3833',
 '3838',
 '3839',
 '3841',
 '3850',
 '3851',
 '3852',
 '3853',
 '3858',
 '3859',
 '3860',
 '3863',
 '3864',
 '3865',
 '3868',
 '3869',
 '3870',
 '3871',
 '3872',
 '3878',
 '3879',
 '3885',
 '3890',
 '0530',
 '0531',
 '0532',
 '0537',
 '0538',
 '3200',
 '3201',
 '3202',
 '3203',
 '3204',
 '3205',
 '3207',
 '3208',
 '3209',
 '3210',
 '3211',
 '3212',
 '3213',
 '3214',
 '3215',
 '3216',
 '3217',
 '3218',
 '3220',
 '3221',
 '3222',
 '3229',
 '3230',
 '3231',
 '3232',
 '3233',
 '3234',
 '3235',
 '3236',
 '3237',
 '3238',
 '3239',
 '3240',
 '3241',
 '3249',
 '3259',
 '3269',
 '3510',
 '3511',
 '3518',
 '3519',
 '8000',
 '8001',
 '8002',
 '8003',
 '8010',
 '8011',
 '8012',
 '8013',
 '8022',
 '8023',
 '8024',
 '8025',
 '8028',
 '8029',
 '8030',
 '8031',
 '8032',
 '8033',
 '8040',
 '8041',
 '8042',
 '8043',
 '8509',
 '8510',
 '8511',
 '8520',
 '8521',
 '8530',
 

In [52]:
# collect the individuals that should be excluded because of icd9
ex_fxn_icd9 = lambda row: contains_exclusion(row, ex_critia_icd9)
ex_9 = icd9.apply(ex_fxn_icd9, axis=1)

In [53]:
# remove them from the working database
filtered = filtered[~ex_9]

In [54]:
print(sum(ex_9), "individuals removed because of icd9 codes")

739 individuals removed because of icd9 codes


In [55]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,British
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,British
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,British
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,British
5,1000063,1000063,Male,1967,April,No,,,,No,...,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,British
486412,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,British
486413,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,British
486414,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,British


In [56]:
#len(missing_cases - set(filtered["IID"].to_list()))

## 3.3. Filter out f.20002 exclusions

In [56]:
# these are the columns that represent the self-report columns in the working database
f20002_colnames = [col for col in filtered if "f.20002" in col]

In [57]:
# get a dataframe that only contains the self-report columns from the working database
f20002 = filtered[f20002_colnames]
f20002

Unnamed: 0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,...,f.20002.3.24,f.20002.3.25,f.20002.3.26,f.20002.3.27,f.20002.3.28,f.20002.3.29,f.20002.3.30,f.20002.3.31,f.20002.3.32,f.20002.3.33
1,1065,,,,,,,,,,...,,,,,,,,,,
2,1396,1473,,,,,,,,,...,,,,,,,,,,
3,1065,1294,1476,1473,1374,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1387,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,1464,,,,,,,,,,...,,,,,,,,,,
486412,1478,1473,,,,,,,,,...,,,,,,,,,,
486413,,,,,,,,,,,...,,,,,,,,,,
486414,1265,,,,,,,,,,...,,,,,,,,,,


In [58]:
# get rows from exclusion database that contian the codes that need to be removed for self-report
exclude_f20002 = exclusion[(exclusion["UKBB_field_code"] == "f.20002") & (exclusion["Excluded_fulldb_lateonsetHI"] == 'Y') ]
exclude_f20002

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
539,f.20002,1420 otosclerosis,260.0,Y,,,,,,,
540,f.20002,1421 meniere's disease,1553.0,Y,,,,,,,
541,f.20002,1499 labyrinthitis,417.0,Y,,,,,,,
545,f.20002,1244 infection of nervous system,55.0,Y,,,,,,,
546,f.20002,1245 brain abscess/intracranial abscess,79.0,Y,,,,,,,
547,f.20002,1246 encephalitis,348.0,Y,,,,,,,
548,f.20002,1247 meningitis,2214.0,Y,,,,,,,
550,f.20002,1249 cranial nerve problem/palsy,289.0,Y,,,,,,,
551,f.20002,1250 bell's palsy/facial nerve palsy,591.0,Y,,,,,,,
553,f.20002,1240 neurological injury/trauma,130.0,Y,,,,,,,


In [59]:
# get the self-report codes that should be excluded from the working database
ex_critia_f20002 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_f20002["Phenotype"].tolist()]
ex_critia_f20002

['1420',
 '1421',
 '1499',
 '1244',
 '1245',
 '1246',
 '1247',
 '1249',
 '1250',
 '1240',
 '1626',
 '1086',
 '1491',
 '1083',
 '1425']

In [60]:
# collect the individuals that should be excluded because of self-report
ex_fxn_f20002 = lambda row: contains_exclusion(row, ex_critia_f20002)
ex_f20002 = f20002.apply(ex_fxn_f20002, axis=1)

In [61]:
# remove them from the working database
filtered = filtered[~ex_f20002]

In [62]:
print(sum(ex_f20002), "individuals removed because of self-reported codes")

5571 individuals removed because of self-reported codes


In [63]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,,British
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,,British
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,,British
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,,British
5,1000063,1000063,Male,1967,April,No,,,,No,...,,,,,,,,,,British
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,,British
486412,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,,British
486413,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,,British
486414,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,,British


In [64]:
#len(missing_cases - set(filtered["IID"].to_list()))

# 4. Identify Sex Column

In [64]:
# male is denoted a 0, female as 1
def find_sex(row):
    if row["f.31.0.0"] == "Male":
        return 0
    return 1

sex = filtered[["f.31.0.0"]].apply(find_sex, axis=1)
sex

1         0
2         0
3         1
4         1
5         0
         ..
486411    1
486412    1
486413    1
486414    1
486415    0
Length: 440265, dtype: int64

In [65]:
filtered["sex"] = sex

  filtered["sex"] = sex
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["sex"] = sex


In [66]:
filtered

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,...,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
1,1000022,1000022,Male,1954,August,Yes,,,,Yes,...,,,,,,,,,British,0
2,1000035,1000035,Male,1944,May,No,,,,Yes,...,,,,,,,,,British,0
3,1000046,1000046,Female,1946,March,No,,No,,No,...,,,,,,,,,British,1
4,1000054,1000054,Female,1942,January,No,,,,Yes,...,,,,,,,,,British,1
5,1000063,1000063,Male,1967,April,No,,,,No,...,,,,,,,,,British,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,No,,,,Yes,...,,,,,,,,,British,1
486412,6025409,6025409,Female,1946,November,No,No,,,No,...,,,,,,,,,British,1
486413,6025411,6025411,Female,1960,November,No,,,,No,...,,,,,,,,,British,1
486414,6025425,6025425,Female,1963,August,No,,,,No,...,,,,,,,,,British,1


In [67]:
filtered = filtered.reset_index()

  filtered = filtered.reset_index()


In [68]:
filtered

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
0,1,1000022,1000022,Male,1954,August,Yes,,,,...,,,,,,,,,British,0
1,2,1000035,1000035,Male,1944,May,No,,,,...,,,,,,,,,British,0
2,3,1000046,1000046,Female,1946,March,No,,No,,...,,,,,,,,,British,1
3,4,1000054,1000054,Female,1942,January,No,,,,...,,,,,,,,,British,1
4,5,1000063,1000063,Male,1967,April,No,,,,...,,,,,,,,,British,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440260,486411,6025390,6025390,Female,1942,March,No,,,,...,,,,,,,,,British,1
440261,486412,6025409,6025409,Female,1946,November,No,No,,,...,,,,,,,,,British,1
440262,486413,6025411,6025411,Female,1960,November,No,,,,...,,,,,,,,,British,1
440263,486414,6025425,6025425,Female,1963,August,No,,,,...,,,,,,,,,British,1


In [69]:
saved_filtered = filtered

In [70]:
filtered = saved_filtered

In [71]:
filtered[filtered["ethnicity"] == "British"]

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
0,1,1000022,1000022,Male,1954,August,Yes,,,,...,,,,,,,,,British,0
1,2,1000035,1000035,Male,1944,May,No,,,,...,,,,,,,,,British,0
2,3,1000046,1000046,Female,1946,March,No,,No,,...,,,,,,,,,British,1
3,4,1000054,1000054,Female,1942,January,No,,,,...,,,,,,,,,British,1
4,5,1000063,1000063,Male,1967,April,No,,,,...,,,,,,,,,British,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440260,486411,6025390,6025390,Female,1942,March,No,,,,...,,,,,,,,,British,1
440261,486412,6025409,6025409,Female,1946,November,No,No,,,...,,,,,,,,,British,1
440262,486413,6025411,6025411,Female,1960,November,No,,,,...,,,,,,,,,British,1
440263,486414,6025425,6025425,Female,1963,August,No,,,,...,,,,,,,,,British,1


In [72]:
filtered[filtered["ethnicity"] == "Irish"]

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
7,8,1000090,1000090,Female,1945,July,Yes,,,,...,,,,,,,,,Irish,1
34,37,1000415,1000415,Male,1942,December,No,,,,...,,,,,,,,,Irish,0
113,125,1001316,1001316,Male,1964,September,No,,Yes,,...,,,,,,,,,Irish,0
129,143,1001492,1001492,Male,1947,August,No,,,,...,,,,,,,,,Irish,0
178,195,1002031,1002031,Female,1946,February,No,,,,...,,,,,,,,,Irish,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
439903,486001,6021166,6021166,Female,1947,June,No,,,,...,,,,,,,,,Irish,1
439990,486100,6022183,6022183,Male,1946,June,Yes,,,,...,,,,,,,,,Irish,0
440048,486165,6022857,6022857,Male,1964,November,No,,,,...,,,,,,,,,Irish,0
440112,486234,6023551,6023551,Female,1966,February,No,,No,,...,,,,,,,,,Irish,1


In [73]:
filtered[filtered["ethnicity"] == "White"]

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
765,850,1008841,1008841,Female,1964,October,Yes,,,,...,,,,,,,,,White,1
1505,1665,1017312,1017312,Female,1947,March,No,,,,...,,,,,,,,,White,1
1704,1884,1019550,1019550,Female,1942,July,No,,,,...,,,,,,,,,White,1
7977,8845,1091708,1091708,Female,1965,February,No,,,,...,,,,,,,,,White,1
9107,10089,1104586,1104586,Female,1941,November,No,,,,...,,,,,,,,,White,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435193,480759,5967068,5967068,Male,1947,December,Yes,,,,...,,,,,,,,,White,0
436852,482627,5986358,5986358,Female,1955,May,No,,,,...,,,,,,,,,White,1
437942,483842,5998866,5998866,Male,1946,March,No,,,,...,,,,,,,,,White,0
438135,484053,6001050,6001050,Female,1939,May,No,,,,...,,,,,,,,,White,1


In [74]:
filtered[filtered["ethnicity"] == "Inconsistent_white"]

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
608,681,1007061,1007061,Male,1950,May,No,,No,,...,,,,,,,,,Inconsistent_white,0
2177,2407,1024996,1024996,Female,1939,February,No,Do not know,,,...,,,,,,,,,Inconsistent_white,1
3610,3999,1041484,1041484,Male,1951,December,No,No,,,...,,,,,,,,,Inconsistent_white,0
3946,4369,1045313,1045313,Male,1953,August,No,No,,,...,,,,,,,,,Inconsistent_white,0
5215,5778,1059988,1059988,Male,1950,July,No,No,No,,...,,,,,,,,,Inconsistent_white,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435690,481348,5973198,5973198,Female,1941,April,No,No,No,,...,,,,,,,,,Inconsistent_white,1
436484,482218,5982180,5982180,Male,1967,June,No,Yes,,,...,,,,,,,,,Inconsistent_white,0
436771,482540,5985476,5985476,Female,1955,October,No,No,No,,...,,,,,,,,,Inconsistent_white,1
437208,483033,5990529,5990529,Female,1949,August,No,No,No,,...,,,,,,,,,Inconsistent_white,1


In [75]:
filtered[filtered["ethnicity"] == "Unknown"]

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
58,65,1000701,1000701,Female,1949,October,No,,,,...,,,,,,,,,Unknown,1
668,745,1007738,1007738,Female,1944,December,No,,,,...,,,,,,,,,Unknown,1
770,856,1008909,1008909,Male,1944,March,No,,,,...,,,,,,,,,Unknown,0
778,865,1008997,1008997,Male,1945,August,No,,,,...,,,,,,,,,Unknown,0
1010,1122,1011645,1011645,Male,1960,June,Yes,,,,...,,,,,,,,,Unknown,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437819,483710,5997502,5997502,Male,1946,November,,,,,...,,,,,,,,,Unknown,0
439471,485524,6016238,6016238,Female,1953,February,No,,,,...,,,,,,,,,Unknown,1
439670,485740,6018439,6018439,Male,1964,January,No,,,,...,,,,,,,,,Unknown,0
439691,485763,6018674,6018674,Female,1947,April,Yes,,,,...,,,,,,,,,Unknown,1


In [76]:
filtered[filtered["ethnicity"] == "Any_other_white_background"]

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
28,30,1000331,1000331,Female,1956,December,No,,,,...,,,,,,,,,Any_other_white_background,1
65,72,1000776,1000776,Female,1946,June,No,,,,...,,,,,,,,,Any_other_white_background,1
67,74,1000799,1000799,Male,1963,July,No,,,,...,,,,,,,,,Any_other_white_background,0
73,80,1000858,1000858,Male,1947,May,No,,,,...,,,,,,,,,Any_other_white_background,0
77,86,1000914,1000914,Male,1962,July,No,,,,...,,,,,,,,,Any_other_white_background,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440143,486270,6023920,6023920,Female,1967,September,No,,,,...,,,,,,,,,Any_other_white_background,1
440155,486283,6024051,6024051,Female,1960,February,No,,,,...,,,,,,,,,Any_other_white_background,1
440160,486288,6024100,6024100,Male,1952,June,Yes,,,,...,,,,,,,,,Any_other_white_background,0
440169,486298,6024208,6024208,Female,1965,November,Yes,,,,...,,,,,,,,,Any_other_white_background,1


# 5. f.3393, f.2247, f.2257, and Mendelian

## 5.1. Remove inconsistencies or unclear individuals

Some individuals might be unclear on if they do or do not have hearing difficulties or are inconsistent (found in f.3393, f.2247, and f.2257), in which case they cannot be considered either controls or cases and must be removed.

The conditions for being removed are as follows:
* Saying I don't know after saying either yes or no
* Only saying I don't know or prefer not to say
* Being completely deaf

### 5.1.1. Prior to filtering for inconsistencies

<b>Hearing difficulty/problems with background noise</b> <br>
f.2257 = {'Yes': 81218, NA : 513774, 'No': 131091, 'Do not know': 4409, 'Prefer not to answer': 208}

<b>Hearing difficult/problems</b><br>
f.2247 = {'No': 151758, : 513806, 'Yes': 55437, 'Do not know': 9489, 'Prefer not to answer': 171, 'I am completely deaf': 39}

<b>Hearing aid user</b><br>
f.3393 = {'No': 145486, : 577795, 'Yes': 7237, 'Prefer not to answer': 182}

### 5.1.2. Setup for inconsistency filtering

In [77]:
# collect all the columns 
# redefining here for clarity

hearing_imp_f3393 = [col for col in filtered if "f.3393" in col]
hearing_imp_f2247 = [col for col in filtered if "f.2247" in col]
hearing_imp_f2257 = [col for col in filtered if "f.2257" in col]

icd_10_cols = [col for col in df if "f.41270" in col]
icd_9_cols = [col for col in df if "f.41271" in col]

In [78]:
# for hearing impairement questions we code the answers with the following for comparison
hearing_ans = {"Do not know":9, "Yes":1, "No":0}

# will contain the complete set of actual combinations of answers from individuals in the database
options = set()

# pass one pheno at a time to this function
def find_options(row):
    answer = "".join([str(hearing_ans[i]) for i in row if not pd.isna(i) and i != "I am completely deaf" and i != "Prefer not to answer"])
    options.add(answer)

In [79]:
# this builds the options set to contain a set of all the unique answers the individuals in the database have had
# over their hearing impairment questions
hearing_imp_qs = filtered[hearing_imp_f3393]
s = hearing_imp_qs.apply(find_options, axis=1)
hearing_imp_qs = filtered[hearing_imp_f2247]
s = hearing_imp_qs.apply(find_options, axis=1)
hearing_imp_qs = filtered[hearing_imp_f2257]
s = hearing_imp_qs.apply(find_options, axis=1)

In [None]:
options

In [80]:
# we might have inconsistencies if we have don't knows mixed with other answers or even yes and no together
# options contains the set of all unique codes 

do_not_know_no = [i for i in options if '0' in i and '9' in i and '1' not in i]
do_not_know_yes = [i for i in options if '0' not in i and '9' in i and '1' in i]
yes_no = [i for i in options if '0' in i and '9' not in i and '1' in i]
with_all_three = [i for i in options if '0' in i and '9' in i and '1' in i]

# collecting all possible occurances that can be flagged for being inconsistent
might_inconsistent = do_not_know_no + do_not_know_yes + yes_no + with_all_three

# these are exceptions to the possibilities we have that can be flagged for being inconsistent
exceptions = ["91","911","9111","991","0001","001","0011","01","011","0111", "0091", "091","0911","0991","9001","901","9011", "90", "900", "9000", "990"]

# collect the list of answer combinations that are actually inconsistent by removingthe answer combinations that are exceptions
inconsistent = [i for i in might_inconsistent if i not in exceptions]

In [81]:
inconsistent

['0090',
 '0990',
 '0009',
 '090',
 '0999',
 '009',
 '9090',
 '9099',
 '909',
 '0900',
 '0909',
 '0099',
 '9009',
 '9990',
 '09',
 '099',
 '1911',
 '1119',
 '919',
 '9911',
 '119',
 '19',
 '9991',
 '191',
 '199',
 '1191',
 '9119',
 '9191',
 '1100',
 '1110',
 '110',
 '010',
 '10',
 '1000',
 '0100',
 '101',
 '0110',
 '100',
 '1001',
 '0101',
 '1011',
 '1010',
 '0010',
 '1101',
 '1090',
 '1099',
 '0199',
 '109',
 '0191',
 '1900',
 '1019',
 '1910',
 '019',
 '9110',
 '0901',
 '0919',
 '0119',
 '0910',
 '1190',
 '1901',
 '0190',
 '190',
 '0019',
 '910',
 '1009']

### 5.1.3. Filtering out the data

In [82]:
# return True if you find an individual that has NA for every answer in each column
def find_empty(row):
    for i in row:
        if not pd.isna(i):
            return False
    return True

In [83]:
# return True if an individual only answers Do not know but never Yes or No
def find_dont_know(row):
    temp = []
    for i in row:
        if not pd.isna(i):
            temp.append(i)
        
    if "Do not know" in temp and "Yes" not in temp and "No" not in temp:
        return True
    return False

In [84]:
# will return true if that row should be removed
# pass one pheno at a time through the function
def find_inconsistencies(row):
    for i in row:
        if not pd.isna(i) and i == "I am completely deaf":
            return True
    
    answer = "".join([str(hearing_ans[i]) for i in row if not pd.isna(i) and i != "I am completely deaf" and i != "Prefer not to answer"])
    if answer in inconsistent:
        return True
    return False

# if we have individuals that either don't answer or prefer not to say only then we cancel them out
# return true if all the rows have no definitive answers
def find_all_none(row):
    for i in row:
        if not pd.isna(i) and (i == "Yes" or i == "No"):
            return False
    return True

In [85]:
filtered

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
0,1,1000022,1000022,Male,1954,August,Yes,,,,...,,,,,,,,,British,0
1,2,1000035,1000035,Male,1944,May,No,,,,...,,,,,,,,,British,0
2,3,1000046,1000046,Female,1946,March,No,,No,,...,,,,,,,,,British,1
3,4,1000054,1000054,Female,1942,January,No,,,,...,,,,,,,,,British,1
4,5,1000063,1000063,Male,1967,April,No,,,,...,,,,,,,,,British,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440260,486411,6025390,6025390,Female,1942,March,No,,,,...,,,,,,,,,British,1
440261,486412,6025409,6025409,Female,1946,November,No,No,,,...,,,,,,,,,British,1
440262,486413,6025411,6025411,Female,1960,November,No,,,,...,,,,,,,,,British,1
440263,486414,6025425,6025425,Female,1963,August,No,,,,...,,,,,,,,,British,1


In [86]:
# filter out inconsistencies for f3393
hearing_imp_qs = filtered[hearing_imp_f3393]
exclude = hearing_imp_qs.apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

In [87]:
len(missing_cases - set(filtered["IID"].to_list()))

NameError: name 'missing_cases' is not defined

In [88]:
# filter out inconsistencies for f2247
hearing_imp_qs = filtered[hearing_imp_f2247]
exclude = hearing_imp_qs.apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

In [None]:
len(missing_cases - set(filtered["IID"].to_list()))

In [89]:
# filter out inconsistencies for f2257
hearing_imp_qs = filtered[hearing_imp_f2257]
exclude = hearing_imp_qs.apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

In [None]:
len(missing_cases - set(filtered["IID"].to_list()))

In [90]:
# filter out individuals that don't have a definiteive answer for any hearing aid questions
hearing_imp_qs = filtered[hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257]
exclude = hearing_imp_qs.apply(find_all_none, axis=1)
filtered = filtered[~exclude]

In [None]:
len(missing_cases - set(filtered["IID"].to_list()))

In [91]:
filtered

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex
0,1,1000022,1000022,Male,1954,August,Yes,,,,...,,,,,,,,,British,0
1,2,1000035,1000035,Male,1944,May,No,,,,...,,,,,,,,,British,0
2,3,1000046,1000046,Female,1946,March,No,,No,,...,,,,,,,,,British,1
3,4,1000054,1000054,Female,1942,January,No,,,,...,,,,,,,,,British,1
4,5,1000063,1000063,Male,1967,April,No,,,,...,,,,,,,,,British,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440260,486411,6025390,6025390,Female,1942,March,No,,,,...,,,,,,,,,British,1
440261,486412,6025409,6025409,Female,1946,November,No,No,,,...,,,,,,,,,British,1
440262,486413,6025411,6025411,Female,1960,November,No,,,,...,,,,,,,,,British,1
440263,486414,6025425,6025425,Female,1963,August,No,,,,...,,,,,,,,,British,1


In [None]:
saved_2_filtered = filtered

In [None]:
filtered = saved_2_filtered

## 5.2. Identify Pure Controls

Need to make sure that for f.3393, f.2247, and f.2257 we obtain the individuals that say no.

The conditions for saying no are as follows:
* Never saying yes
* Not being inconsistent
* Saying no at least once (prefer not to say is allowed)

We are also not including individuals to be part of the control group if they have certain codes for ICD9, ICD10, or f.20002 (this is in the case that they say no to all f.3393, f.2247, and f.2257). However these individuals can still be part of the cases

In [92]:
# returns 0 if it's a ctrl or else 1
def find_ctrl(row):
    answer = "".join([str(hearing_ans[i]) for i in row if not pd.isna(i) and i != "Prefer not to answer"])
    if "0" in answer and "1" not in answer:
        return 0
    return 1

# returns 0 if it's a ctrl or else 1
# this is specific for f3393 because f3393 was only asked under certain circumstances
def find_ctrl_or_NA(row):
    for i in row:
        if not pd.isna(i) and i != "No" and i != "Prefer not to answer": # if we have any answers that are not NA or No only then we don't have a ctrl
            return 1
    return 0


In [93]:
# filter through the hearing impairment questions to find the controls
hearing_imp_qs = filtered[hearing_imp_f3393]
f3393_ctrl = hearing_imp_qs.apply(find_ctrl_or_NA, axis=1).to_list()
hearing_imp_qs = filtered[hearing_imp_f2247]
f2247_ctrl = hearing_imp_qs.apply(find_ctrl, axis=1).to_list()
hearing_imp_qs = filtered[hearing_imp_f2257]
f2257_ctrl = hearing_imp_qs.apply(find_ctrl, axis=1).to_list()

In [94]:
pure_ctrl = [0 if i == 0 and f2247_ctrl[en] == 0 and f2257_ctrl[en] == 0 else 1 for en, i in enumerate(f3393_ctrl)]

In [95]:
print(len(pure_ctrl) - sum(pure_ctrl), "individuals are controls prior to filtration for icd10, icd9 and self-reported codes")

239260 individuals are controls prior to filtration for icd10, icd9 and self-reported codes


### 5.2.1. Collect ICD 10 codes to filter out from Ctrl

In [96]:
exclude_ctrl_icd10 = exclusion[(exclusion["UKBB_field_code"] == "f.41270") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd10

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
97,f.41270,H83.3 Noise effects on inner ear,24.0,N,Y,,,,,,
98,f.41270,H83.8 Other specified diseases of inner ear,51.0,N,Y,,,,,,
99,f.41270,"H83.9 Disease of inner ear, unspecified",33.0,N,Y,,,,,,
105,f.41270,"H90.3 Sensorineural hearing loss, bilateral",721.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,Individuals with this code were initially excl...
106,f.41270,"H90.4 Sensorineural hearing loss, unilateral w...",185.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,N,,
107,f.41270,"H90.5 Sensorineural hearing loss, unspecified",880.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,Individuals with this code were initially excl...
108,f.41270,H90.6 Mixed conductive and sensorineural heari...,133.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,Individuals with this code were initially excl...
109,f.41270,H90.7 Mixed conductive and sensorineural heari...,75.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,N,,
110,f.41270,H90.8 Mixed conductive and sensorineural heari...,115.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,Individuals with this code were initially excl...
113,f.41270,H91.1 Presbycusis,408.0,N,Y,,,,N,,


In [97]:
ex_critia_ctrl_icd10 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd10["Phenotype"].tolist()]
ex_critia_ctrl_icd10

['H833',
 'H838',
 'H839',
 'H903',
 'H904',
 'H905',
 'H906',
 'H907',
 'H908',
 'H911',
 'H912',
 'H913',
 'H918',
 'H919',
 'H930',
 'H931',
 'H932',
 'H933',
 'H938',
 'H939',
 'Z461',
 'Z974']

In [98]:
icd10 = filtered[icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,...,f.41270.0.216,f.41270.0.217,f.41270.0.218,f.41270.0.219,f.41270.0.220,f.41270.0.221,f.41270.0.222,f.41270.0.223,f.41270.0.224,f.41270.0.225
0,F101,J342,R619,S8280,W010,,,,,,...,,,,,,,,,,
1,H269,K579,K590,K621,M5459,N40,R040,R31,R398,Z466,...,,,,,,,,,,
2,E780,G473,R065,R074,Z824,,,,,,...,,,,,,,,,,
3,C679,C787,C795,C798,D090,I802,I959,J181,K922,M169,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440260,A099,D649,E279,E538,E559,I10,I839,K449,K573,K649,...,,,,,,,,,,
440261,,,,,,,,,,,...,,,,,,,,,,
440262,O149,O266,O342,O471,O48,O610,O680,Z370,,,...,,,,,,,,,,
440263,G551,M501,,,,,,,,,...,,,,,,,,,,


In [99]:
# collect the individuals who should not be part of controls because of icd 10 codes
ex_fxn_icd10 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd10)
ex_10 = icd10.apply(ex_fxn_icd10, axis=1)

### 5.2.2. Collect ICD 9 codes to filter out from Ctrl

In [100]:
exclude_ctrl_icd9 = exclusion[(exclusion["UKBB_field_code"] == "f.41271") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd9

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
371,f.41271,3880 Degenerative and vascular disorders of ear,0.0,N,Y,,,,,,
372,f.41271,3881 Noise effects on inner ear,0.0,N,Y,,,,,,
373,f.41271,"3882 Sudden hearing loss, unspecified",0.0,N,Y,,,,,,
374,f.41271,3883 Tinnitus,11.0,N,Y,,,,,,
375,f.41271,3884 Other abnormal auditory perception,0.0,N,Y,,,,,,
379,f.41271,3888 Other specified disorders of ear,1.0,N,Y,,,,,,
380,f.41271,"3889 Disorders of ear, unspecified",2.0,N,Y,,,,,,
383,f.41271,3891 Sensorineural deafness,6.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,
384,f.41271,3892 Mixed conductive and sensorineural deafness,1.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,
385,f.41271,"3897 Deaf mutism, not elsewhere classifiable",1.0,N,Y,Y if individual is positive for f3393 and >55yo,Y if individual is positive for f2247 and >55yo,Y if individual is positive for f2257 and >55yo,Y,,


In [101]:
ex_critia_ctrl_icd9 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd9["Phenotype"].tolist()]
ex_critia_ctrl_icd9

['3880',
 '3881',
 '3882',
 '3883',
 '3884',
 '3888',
 '3889',
 '3891',
 '3892',
 '3897',
 '3898',
 '3899',
 'V412',
 'V532']

In [102]:
icd9 = filtered[icd9_colnames]
icd9

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,...,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,3000,5198,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440260,,,,,,,,,,,...,,,,,,,,,,
440261,,,,,,,,,,,...,,,,,,,,,,
440262,,,,,,,,,,,...,,,,,,,,,,
440263,,,,,,,,,,,...,,,,,,,,,,


In [103]:
# collect the individuals who should not be part of controls because of icd 9 codes
ex_fxn_icd9 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd9)
ex_9 = icd9.apply(ex_fxn_icd9, axis=1)

### 5.2.3. Collect f20002 codes to filter out from Ctrl

In [104]:
exclude_ctrl_f20002 = exclusion[(exclusion["UKBB_field_code"] == "f.20002") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_f20002

Unnamed: 0,UKBB_field_code,Phenotype,cases_UKB_showcase,Excluded_fulldb_lateonsetHI,Excluded_from_controls,Cases_f3393,Cases_f2247,Cases_f2257,Cases_Mendelian_like,Excluded_from_controls_Medelian_like,observations
543,f.20002,1597 tinnitus / tiniitis,1950.0,N,Y,,,,,,


In [105]:
f20002 = filtered[f20002_colnames]
f20002

Unnamed: 0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,...,f.20002.3.24,f.20002.3.25,f.20002.3.26,f.20002.3.27,f.20002.3.28,f.20002.3.29,f.20002.3.30,f.20002.3.31,f.20002.3.32,f.20002.3.33
0,1065,,,,,,,,,,...,,,,,,,,,,
1,1396,1473,,,,,,,,,...,,,,,,,,,,
2,1065,1294,1476,1473,1374,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,1387,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440260,1464,,,,,,,,,,...,,,,,,,,,,
440261,1478,1473,,,,,,,,,...,,,,,,,,,,
440262,,,,,,,,,,,...,,,,,,,,,,
440263,1265,,,,,,,,,,...,,,,,,,,,,


In [106]:
ex_critia_ctrl_f20002 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_f20002["Phenotype"].tolist()]
ex_critia_ctrl_f20002

['1597']

In [107]:
# collect the individuals who should not be part of controls because of self-reported codes
ex_fxn_f20002 = lambda row: contains_exclusion(row, ex_critia_ctrl_f20002)
ex_f20002 = f20002.apply(ex_fxn_f20002, axis=1)

### 5.2.4. Filter out the HI Ctrl

In [108]:
temp = pure_ctrl | ex_10 | ex_9 | ex_f20002

In [109]:
# because individuals that are controls are labeled as 0
# temp says True if an individual is not a control and False if it is a control
# ex_10, ex_9, and ex_f20002 are True for individuals that are not controls and False for individuals that are controls
temp = pure_ctrl | ex_10 | ex_9 | ex_f20002

# we set the control as 0 for each individual that is False in temp
filtered_ctrl = [1 if i else 0 for i in temp.to_list()]

In [110]:
print(len(filtered_ctrl) - sum(filtered_ctrl), "individuals are controls after addition filtration for icd10, icd9 and self-reported codes")

237318 individuals are controls after addition filtration for icd10, icd9 and self-reported codes


In [111]:
filtered["hearing_imp_pure_ctrl"] = filtered_ctrl

  filtered["hearing_imp_pure_ctrl"] = filtered_ctrl


In [112]:
filtered

Unnamed: 0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,...,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0,ethnicity,sex,hearing_imp_pure_ctrl
0,1,1000022,1000022,Male,1954,August,Yes,,,,...,,,,,,,,British,0,1
1,2,1000035,1000035,Male,1944,May,No,,,,...,,,,,,,,British,0,1
2,3,1000046,1000046,Female,1946,March,No,,No,,...,,,,,,,,British,1,1
3,4,1000054,1000054,Female,1942,January,No,,,,...,,,,,,,,British,1,1
4,5,1000063,1000063,Male,1967,April,No,,,,...,,,,,,,,British,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
440260,486411,6025390,6025390,Female,1942,March,No,,,,...,,,,,,,,British,1,1
440261,486412,6025409,6025409,Female,1946,November,No,No,,,...,,,,,,,,British,1,0
440262,486413,6025411,6025411,Female,1960,November,No,,,,...,,,,,,,,British,1,0
440263,486414,6025425,6025425,Female,1963,August,No,,,,...,,,,,,,,British,1,0


In [113]:
filtered = filtered.reset_index()

  filtered = filtered.reset_index()


In [None]:
saved_3_filtered = filtered

In [None]:
filtered = saved_3_filtered

## 5.3. Identify All Age and Phenotype Columns

In [114]:
ages_f21003_col = [col for col in filtered if "f.21003" in col]
ages_f21003_col

['f.21003.0.0', 'f.21003.1.0', 'f.21003.2.0', 'f.21003.3.0']

In [115]:
ages_f131258_col = [col.strip('"') for col in header if 'f.131258' in col]
ages_f131258_col

['f.131258.0.0']

In [116]:
filtered[ages_f21003_col]

Unnamed: 0,f.21003.0.0,f.21003.1.0,f.21003.2.0,f.21003.3.0
0,53,,,
1,63,,,
2,62,,73,
3,65,,,
4,43,,,
...,...,...,...,...
429094,67,,,
429095,61,66,72,
429096,49,,,
429097,44,,,


In [117]:
filtered[ages_f131258_col]

Unnamed: 0,f.131258.0.0
0,
1,
2,
3,
4,
...,...
429094,
429095,
429096,
429097,


In [118]:
# get the latest time that an individual said no to any of the phenotypes
# return the oldest age that they were
def get_ctrl_age(row):
    phens = [hearing_imp_f3393, hearing_imp_f2247, hearing_imp_f2257]
    ages = []
    if row["hearing_imp_pure_ctrl"] == 0:
        temp_all_ages = row[ages_f21003_col].to_list()
        temp_all_ages.reverse()
        
        for phen in phens:
            temp = row[phen].to_list()
            temp.reverse()
            for en, i in enumerate(temp):
                if not pd.isna(i) and i == "No":
                    ages.append(temp_all_ages[en])
                    break
        ages.sort()
        return ages[-1]
    return pd.NA

# get the earliest time that an individual said yes to having a phenotype
def get_phen_age(row):
    if row[0] == 1:
        temp = row[1:-4].to_list()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "Yes":
                return row[ages_f21003_col][en]
    else:
        return pd.NA
    
# return the minium age in the row, or else return NA
def get_min_age(row):
    temp = [i for i in row.to_list() if not pd.isna(i)]
    if len(temp) > 0:
        return min(temp)
    else:
        return pd.NA
    

In [119]:
# return 1 if we have a yes (used to find phenos)
def find_yes(row):
    for i in row:
        if not pd.isna(i) and i == "Yes":
            return 1
    return 0

# return 1 if we have a match for the mendelian traits and have at least one of the hearing phenotypes
def find_medelian_like(row):
    mendelian_icd10 = ["H903", "H905", "H906", "H908", "H913", "H918", "H919"]
    mendelian_icd9 = ["3891", "3892", "3897", "3898", "3899"]

    if 1 in row[["f3393", "f2247", "f2257"]].to_list():
        for en, i in enumerate(row[icd_10_cols]):
            if not pd.isna(i) and i in mendelian_icd10:
                return 1, get_ages_from_birth(row[icd10_ages[en:en+1]+year_of_birth+month_of_birth])
        for en, i in enumerate(row[icd_9_cols]):
            if not pd.isna(i) and i in mendelian_icd9:
                return 1, get_ages_from_birth(row[icd9_ages[en:en+1]+year_of_birth+month_of_birth])
    return 0, pd.NA

# return a 0 if the individual is not a case and 1 if they are a case
def find_exclusions(row):
    mendelian_icd10 = ["H903", "H904", "H905", "H906", "H907", "H908"]
    mendelian_icd9 = ["3891", "3892", "3897", "3898", "3899"]

    try:
        if 1 == row[0]: # the first column will be one of the phenotypes, "f3393", "f2247", or "f2257"
            for en, i in enumerate(row[icd_10_cols]):
                if not pd.isna(i) and i in mendelian_icd10:
                    if get_ages_from_birth(row[ages_f131258_col+year_of_birth+month_of_birth]) <= 55: # row[1] must be the age of the phenotype
                        return 0, pd.NA
                    return 1, get_ages_from_birth(row[ages_f131258_col+year_of_birth+month_of_birth])
            for en, i in enumerate(row[icd_9_cols]):
                if not pd.isna(i) and i in mendelian_icd9:
                    if get_ages_from_birth(row[icd9_ages[en:en+1]+year_of_birth+month_of_birth]) <= 55:
                        return 0, pd.NA
                    return 1, get_ages_from_birth(row[icd9_ages[en:en+1]+year_of_birth+month_of_birth])
        return int(row[0]), row[1] # the second column should be the ages of that phenotype
    except:
        print(row["name"])


# return 1 if we have a match for the other cases of f3393 or originally had f3393
def find_f3393_other_cases(row):
    icd10 = ["Z461", "Z974"]
    icd9 = ["V412", "V532"]
    if 0 == int(row["f3393"]):
        for en, i in enumerate(row[icd_10_cols]):
            if not pd.isna(i) and i in icd10:
                return 1, get_ages_from_birth(row[icd10_ages[en:en+1]+year_of_birth+month_of_birth])
        for en, i in enumerate(row[icd_9_cols]):
            if not pd.isna(i) and i in icd9:
                return 1, get_ages_from_birth(row[icd9_ages[en:en+1]+year_of_birth+month_of_birth])
    return int(row["f3393"]), row["f3393_age"]

# check if the h919 code exists in the individual
def check_code(row):
    for i in row:
        if not pd.isna(i) and i == "H919":
            return 1
    return 0

In [120]:
def get_ages_from_birth(row):
    month_dict = {'January':1,'February':2,'March':3,'April':4,'May':5,'June':6,'July':7,'August':8,'September':9,'October':10,'November':11,'December':12}
    year = 0
    month = 0
    i = row[0]
    if not pd.isna(i):
        i = i.split("-")
        year = i[0]
        month = i[1]
    if int(month) >= month_dict[row[month_of_birth[0]]]:
        return int(year) - int(row[year_of_birth[0]])
    return int(year) - int(row[year_of_birth[0]]) - 1

In [121]:
# f3393
hearing_imp_qs = filtered[hearing_imp_f3393]
filtered["f3393"] = hearing_imp_qs.apply(find_yes, axis=1)
filtered["f3393_age"] = filtered[["f3393"] + hearing_imp_f3393 + ages_f21003_col].apply(get_phen_age, axis=1)
filtered[["f3393", "f3393_age"]] = filtered[["f3393", "f3393_age"] + icd_10_cols + icd_9_cols + icd10_ages + icd9_ages + ages_f131258_col + year_of_birth + month_of_birth].apply(find_exclusions, axis=1, result_type='expand')
filtered[["f3393", "f3393_age"]] = filtered[["f3393", "f3393_age"] + icd_10_cols + icd_9_cols + icd10_ages + icd9_ages + year_of_birth + month_of_birth].apply(find_f3393_other_cases, axis=1, result_type='expand')

  filtered["f3393"] = hearing_imp_qs.apply(find_yes, axis=1)
  filtered["f3393_age"] = filtered[["f3393"] + hearing_imp_f3393 + ages_f21003_col].apply(get_phen_age, axis=1)


In [122]:
# check if we have f3393 ages
filtered[(filtered["f3393"] == 1) & (pd.isna(filtered["f3393_age"])) & (pd.isna(filtered[ages_f131258_col[0]]))][["f3393", "f3393_age"] + ages_f131258_col]

Unnamed: 0,f3393,f3393_age,f.131258.0.0


In [123]:
# f2247
hearing_imp_qs = filtered[hearing_imp_f2247]
filtered["f2247"] = hearing_imp_qs.apply(find_yes, axis=1)
filtered["f2247_age"] = filtered[["f2247"] + hearing_imp_f2247 + ages_f21003_col].apply(get_phen_age, axis=1)
filtered[["f2247", "f2247_age"]] = filtered[["f2247", "f2247_age"] + icd_10_cols + icd_9_cols + icd10_ages + icd9_ages + ages_f131258_col + year_of_birth + month_of_birth].apply(find_exclusions, axis=1, result_type='expand')

  filtered["f2247"] = hearing_imp_qs.apply(find_yes, axis=1)
  filtered["f2247_age"] = filtered[["f2247"] + hearing_imp_f2247 + ages_f21003_col].apply(get_phen_age, axis=1)


In [124]:
# check if we have f2247 ages
filtered[(filtered["f2247"] == 1) & (pd.isna(filtered["f2247_age"]))][["f2247", "f2247_age"]]

Unnamed: 0,f2247,f2247_age


In [125]:
# f2257
hearing_imp_qs = filtered[hearing_imp_f2257]
filtered["f2257"] = hearing_imp_qs.apply(find_yes, axis=1)
filtered["f2257_age"] = filtered[["f2257"] + hearing_imp_f2257 + ages_f21003_col].apply(get_phen_age, axis=1)
filtered[["f2257", "f2257_age"]] = filtered[["f2257", "f2257_age"] + icd_10_cols + icd_9_cols + icd10_ages + icd9_ages + ages_f131258_col + year_of_birth + month_of_birth].apply(find_exclusions, axis=1, result_type='expand')

  filtered["f2257"] = hearing_imp_qs.apply(find_yes, axis=1)
  filtered["f2257_age"] = filtered[["f2257"] + hearing_imp_f2257 + ages_f21003_col].apply(get_phen_age, axis=1)


In [126]:
# check if we have empty f2257 ages
filtered[(filtered["f2257"] == 1) & (pd.isna(filtered["f2257_age"]))][["f2257", "f2257_age"]]

Unnamed: 0,f2257,f2257_age


In [127]:
# mendelian
filtered[["mendelian", "mendelian_age"]] = filtered[icd_10_cols + icd_9_cols + ["f3393", "f2247", "f2257"] + icd10_ages + icd9_ages + year_of_birth + month_of_birth].apply(find_medelian_like, axis=1, result_type='expand')

  self[k1] = value[k2]


In [128]:
# check if we have empty mendelian ages
filtered[(filtered["mendelian"] == 1) & (pd.isna(filtered["mendelian_age"]))][["mendelian", "mendelian_age"]]

Unnamed: 0,mendelian,mendelian_age


In [129]:
# ctrl age
filtered["ctrl_age"] = filtered[["hearing_imp_pure_ctrl"] + ages_f21003_col +  hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257].apply(get_ctrl_age, axis=1)

  filtered["ctrl_age"] = filtered[["hearing_imp_pure_ctrl"] + ages_f21003_col +  hearing_imp_f3393 + hearing_imp_f2247 + hearing_imp_f2257].apply(get_ctrl_age, axis=1)


In [130]:
# check if we have empty ctrl ages
filtered[(filtered["hearing_imp_pure_ctrl"] == 0) & (pd.isna(filtered["ctrl_age"]))][["hearing_imp_pure_ctrl", "ctrl_age"]]

Unnamed: 0,hearing_imp_pure_ctrl,ctrl_age


In [131]:
# f2247_f2257
filtered["f2247_f2257"] = filtered["f2247"] & filtered["f2257"]
filtered["f2247_f2257_age"] = filtered[["f2247_age", "f2257_age"]].apply(get_min_age, axis=1)

  filtered["f2247_f2257"] = filtered["f2247"] & filtered["f2257"]
  filtered["f2247_f2257_age"] = filtered[["f2247_age", "f2257_age"]].apply(get_min_age, axis=1)


In [132]:
# check if we have empty f2247_f2257 ages
filtered[(filtered["f2247_f2257"] == 1) & (pd.isna(filtered["f2247_f2257_age"]))][["f2247_f2257", "f2247_f2257_age"]]

Unnamed: 0,f2247_f2257,f2247_f2257_age


## 5.4. File Output

In [133]:
filtered

Unnamed: 0,level_0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,...,f3393_age,f2247,f2247_age,f2257,f2257_age,mendelian,mendelian_age,ctrl_age,f2247_f2257,f2247_f2257_age
0,0,1,1000022,1000022,Male,1954,August,Yes,,,...,,1,53,1,53,0,,,1,53
1,1,2,1000035,1000035,Male,1944,May,No,,,...,,0,,1,63,0,,,0,63
2,2,3,1000046,1000046,Female,1946,March,No,,No,...,,0,,1,73,0,,,0,73
3,3,4,1000054,1000054,Female,1942,January,No,,,...,,0,,1,65,0,,,0,65
4,4,5,1000063,1000063,Male,1967,April,No,,,...,,0,,0,,0,,43,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429094,440260,486411,6025390,6025390,Female,1942,March,No,,,...,,0,,1,67,0,,,0,67
429095,440261,486412,6025409,6025409,Female,1946,November,No,No,,...,,0,,0,,0,,66,0,
429096,440262,486413,6025411,6025411,Female,1960,November,No,,,...,,0,,0,,0,,49,0,
429097,440263,486414,6025425,6025425,Female,1963,August,No,,,...,,0,,0,,0,,44,0,


In [None]:
filtered[filtered["hearing_imp_pure_ctrl"] == 0][["FID", "IID", "sex", "hearing_imp_pure_ctrl", "ctrl_age", "ethnicity"]].to_csv("pure_ctrl_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["f3393"] == 1][["FID", "IID", "sex", "f3393", "f3393_age", "ethnicity"]]

In [None]:
filtered[filtered["f3393"] == 1][["FID", "IID", "sex", "f3393", "f3393_age", "ethnicity"]].to_csv("f3393_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["f2247"] == 1][["FID", "IID", "sex", "f2247", "f2247_age", "ethnicity"]]

In [None]:
filtered[filtered["f2247"] == 1][["FID", "IID", "sex", "f2247", "f2247_age", "ethnicity"]].to_csv("f2247_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["f2257"] == 1][["FID", "IID", "sex", "f2257", "f2257_age", "ethnicity"]]

In [None]:
filtered[filtered["f2257"] == 1][["FID", "IID", "sex", "f2257", "f2257_age", "ethnicity"]].to_csv("f2257_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["f2247_f2257"] == 1][["FID", "IID", "sex", "f2247_f2257", "f2247_f2257_age", "ethnicity"]]

In [None]:
filtered[filtered["f2247_f2257"] == 1][["FID", "IID", "sex", "f2247_f2257", "f2247_f2257_age", "ethnicity"]].to_csv("f2247_f2257_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["mendelian"] == 1][["FID", "IID", "sex", "mendelian", "mendelian_age", "ethnicity"]]

In [None]:
filtered[filtered["mendelian"] == 1][["FID", "IID", "sex", "mendelian", "mendelian_age", "ethnicity"]].to_csv("mendelian_pheno_file.tsv", sep='\t', index=False)

# 6. Tinnitus

## 6.1. Remove inconsistencies or unclear individuals

### 6.1.1. Prior to filtering for inconsistencies

<b>Tinnitus</b> <br>
f.4803 = {'No, never': 76141,
 'Yes, but not now, but have in the past': 11400,
 'Yes, now some of the time': 9788,
 'Yes, now a lot of the time': 2973,
 'Yes, now most or all of the time': 7426,
 'Do not know': 1745,
 'Prefer not to answer': 127}

### 6.1.2. Inconsistencies in the tinnitus answers

In [80]:
filtered = saved_filtered

In [79]:
tin_cols = [col for col in filtered if "f.4803" in col]

icd_10_cols = [col for col in df if "f.41270" in col]
icd_9_cols = [col for col in df if "f.41271" in col]

In [80]:
tin_ans = {"Do not know":9, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}
options = set()
# pass one pheno at a time
def find_options(row):
    answer = "".join([str(tin_ans[i]) for i in row if not pd.isna(i) and i != "Prefer not to answer"])
    options.add(answer)

In [81]:
s = filtered[tin_cols].apply(find_options, axis=1)

In [82]:
options

{'',
 '0',
 '00',
 '000',
 '0000',
 '0001',
 '001',
 '0010',
 '0011',
 '009',
 '01',
 '010',
 '0100',
 '0101',
 '0109',
 '011',
 '0110',
 '0111',
 '019',
 '09',
 '090',
 '091',
 '099',
 '1',
 '10',
 '100',
 '1000',
 '101',
 '1010',
 '1011',
 '11',
 '110',
 '1100',
 '1101',
 '111',
 '1110',
 '1111',
 '119',
 '19',
 '190',
 '191',
 '1911',
 '1919',
 '199',
 '9',
 '90',
 '900',
 '901',
 '9011',
 '909',
 '91',
 '911',
 '99',
 '990',
 '991',
 '999'}

In [None]:
# we might have inconsistencies if we have don't knows mixed with other answers or even yes and no together
# options contains the set of all unique codes 

do_not_know_no = [i for i in options if '0' in i and '9' in i and '1' not in i]
do_not_know_yes = [i for i in options if '0' not in i and '9' in i and '1' in i]
yes_no = [i for i in options if '0' in i and '9' not in i and '1' in i]
with_all_three = [i for i in options if '0' in i and '9' in i and '1' in i]

# collecting all possible occurances that can be flagged for being inconsistent
might_inconsistent = do_not_know_no + do_not_know_yes + yes_no + with_all_three

# these are exceptions to the possibilities we have that can be flagged for being inconsistent
exceptions = ["91","911","9111","991","0001","001","0011","01","011","0111", "0091", "091","0911","0991","9001","901","9011", "90", "900", "9000", "990"]

# collect the list of answer combinations that are actually inconsistent by removing the answer combinations that are exceptions
inconsistent = [i for i in might_inconsistent if i not in exceptions]

In [None]:
inconsistent

#### 6.1.2.1. Filtering out the data

In [None]:
filtered

In [None]:
# will return true if that row should be removed
# pass one pheno at a time
def find_inconsistencies(row):
    for i in row:
        if not pd.isna(i) and i == "I am completely deaf":
            return True
    
    answer = "".join([str(tin_ans[i]) for i in row if not pd.isna(i) and i != "I am completely deaf" and i != "Prefer not to answer"])
    if answer in inconsistent:
        return True
    return False

In [None]:
exclude = filtered[tin_cols].apply(find_inconsistencies, axis=1)
filtered = filtered[~exclude]

In [None]:
filtered

## 6.2. Identify Pure Control

Need to make sure that for tinnitus we obtain the individuals that say no.

The conditions for saying no are as follows:
* Never saying yes
* Not being inconsistent
* Saying no at least once (prefer not to say is allowed)

We are also not including individuals to be part of the control group if they have certain codes for ICD9, ICD10, or f.20002 (this is in the case that they say no to all tinnitus). However these individuals can still be part of the cases

In [None]:
tin_ans = {"Do not know":9, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

# returns 0 if it's a ctrl or else 1
def find_ctrl(row):
    answer = "".join([str(tin_ans[i]) for i in row if not pd.isna(i) and i != "Prefer not to answer"])
    if "0" in answer and "1" not in answer:
        return False
    return True

In [None]:
f4803_ctrl = filtered[tin_cols].apply(find_ctrl, axis=1)

In [None]:
sum(f4803_ctrl)

### 6.2.1. Collect ICD 10 codes to filter out from Ctrl

In [None]:
exclude_ctrl_icd10 = exclusion[(exclusion["UKBB_field_code"] == "f.41270") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd10

In [None]:
ex_critia_ctrl_icd10 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd10["Phenotype"].tolist()]
ex_critia_ctrl_icd10

In [None]:
icd10 = filtered[icd10_colnames]
icd10

In [None]:
ex_fxn_icd10 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd10)
ex_10 = icd10.apply(ex_fxn_icd10, axis=1)

### 6.2.2. Collect ICD 9 codes to filter out from Ctrl

In [None]:
exclude_ctrl_icd9 = exclusion[(exclusion["UKBB_field_code"] == "f.41271") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_icd9

In [None]:
ex_critia_ctrl_icd9 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_icd9["Phenotype"].tolist()]
ex_critia_ctrl_icd9

In [None]:
icd9 = filtered[icd9_colnames]
icd9

In [None]:
ex_fxn_icd9 = lambda row: contains_exclusion(row, ex_critia_ctrl_icd9)
ex_9 = icd9.apply(ex_fxn_icd9, axis=1)

### 6.2.3. Collect f20002 codes to filter out from Ctrl

In [None]:
exclude_ctrl_f20002 = exclusion[(exclusion["UKBB_field_code"] == "f.20002") & (exclusion["Excluded_from_controls"] == 'Y') ]
exclude_ctrl_f20002

In [None]:
f20002 = filtered[f20002_colnames]
f20002

In [None]:
ex_critia_ctrl_f20002 = ["".join( (i.split(" ")[0]).split(".") ) for i in exclude_ctrl_f20002["Phenotype"].tolist()]
ex_critia_ctrl_f20002

In [None]:
ex_fxn_f20002 = lambda row: contains_exclusion(row, ex_critia_ctrl_f20002)
ex_f20002 = f20002.apply(ex_fxn_f20002, axis=1)

### 6.2.4. Collect individuals with other tinnitus codes to filter out from CTRL

In [None]:
# check if the given code exists in the individuals
def check_code(row, code):
    for i in row:
        if not pd.isna(i) and i == code:
            return 1
    return 0

In [None]:
tinn_icd10_check_code = lambda row: check_code(row, "H931")
tinn_icd10 = filtered[icd_10_cols].apply(tinn_icd10_check_code, axis = 1)

In [None]:
tinn_icd9_check_code = lambda row: check_code(row, "3883")
tinn_icd9 = filtered[icd_9_cols].apply(tinn_icd9_check_code, axis = 1)

In [None]:
self_report_cols = [col for col in filtered if "f.20002" in col]
tinn_self_report_check_code = lambda row: check_code(row, "1597")
tinn_self_report = filtered[self_report_cols].apply(tinn_self_report_check_code, axis = 1)

### 6.2.5. Filter out Tinnitus Ctrl

In [None]:
sum(f4803_ctrl | ex_10 | ex_9 | ex_f20002 | tinn_icd10 | tinn_icd9 | tinn_self_report)

In [None]:
temp = f4803_ctrl | ex_10 | ex_9 | ex_f20002 | tinn_icd10 | tinn_icd9 | tinn_self_report
filtered_ctrl = [1 if i else 0 for i in temp.to_list()]

In [None]:
filtered["tinnitus_pure_ctrl"] = filtered_ctrl

In [None]:
filtered

## 6.3. Identify Age

In [None]:
ages_f21003_col = [col for col in filtered if "f.21003" in col]
ages_f21003_col

In [None]:
#get the minimum age of each individual in the given columns
def get_min_age(row):
    temp = [i for i in row.to_list() if not pd.isna(i)]
    if len(temp) > 0:
        return min(temp)
    else:
        return pd.NA

In [None]:
filtered["tinnitus_age"] = filtered[ages_f21003_col].apply(get_min_age, axis=1)

In [None]:
saved_tinn = filtered

In [None]:
filtered = saved_tinn

## 6.4. Noisy workplace and Loud Music Variable

Aim to exam interactions for each of the ARHI traits with noise. The two noise phenotypes that we will be using are f.4825 noisy workplace and f.4836 loud music exposure frequency.

### 6.4.1. Check for inconsistencies

<b>f.4825 "Have you ever worked in a noisy place where you had to shout to be heard?"</b> <br>
"No"= "0", <br>
"Yes, for less than a year"= "1", <br>
"Yes, for around 1-5 years"= "2", <br>
"Yes, for more than 5 years"= "3", <br>
"Prefer not to answer"= NA, <br>
"Do not know"= NA <br>

<b>f.4836 "Have you ever listened to music for more than 3 hours per week at a volume which you would need to shout to be heard or, if wearing headphones, someone else would need to shout for you to hear them?" </b> <br>
"No"= "0", <br>
"Yes, for less than a year"= "1", <br>
"Yes, for around 1-5 years"= "2", <br>
"Yes, for more than 5 years"= "3", <br>
"Prefer not to answer"= NA, <br>
"Do not know"= NA <br>

In [None]:
filtered

In [134]:
filtered.to_csv("~/project/guangyou/Tinnitus/saved_filtered_withARHI_20220214.csv",index=False)

In [158]:
import pandas as pd
filtered = pd.read_csv("~/project/guangyou/Tinnitus/saved_filtered_withARHI_20220214.csv")
filtered

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,level_0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,...,f3393_age,f2247,f2247_age,f2257,f2257_age,mendelian,mendelian_age,ctrl_age,f2247_f2257,f2247_f2257_age
0,0,1,1000022,1000022,Male,1954,August,Yes,,,...,,1,53.0,1,53.0,0,,,1,53.0
1,1,2,1000035,1000035,Male,1944,May,No,,,...,,0,,1,63.0,0,,,0,63.0
2,2,3,1000046,1000046,Female,1946,March,No,,No,...,,0,,1,73.0,0,,,0,73.0
3,3,4,1000054,1000054,Female,1942,January,No,,,...,,0,,1,65.0,0,,,0,65.0
4,4,5,1000063,1000063,Male,1967,April,No,,,...,,0,,0,,0,,43.0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429094,440260,486411,6025390,6025390,Female,1942,March,No,,,...,,0,,1,67.0,0,,,0,67.0
429095,440261,486412,6025409,6025409,Female,1946,November,No,No,,...,,0,,0,,0,,66.0,0,
429096,440262,486413,6025411,6025411,Female,1960,November,No,,,...,,0,,0,,0,,49.0,0,
429097,440263,486414,6025425,6025425,Female,1963,August,No,,,...,,0,,0,,0,,44.0,0,


In [159]:
header = filtered.columns
noise_wp_cols =  [col.strip('"') for col in header if "f.4825" in col]
loud_music_cols =  [col.strip('"') for col in header if "f.4836" in col]
combined_cols = indiv + noise_wp_cols + loud_music_cols

In [142]:
# database of all individuals that we are working with and the selected phenotypes
tmp = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/092821_UKBB_486416ind_call90.csv", quotechar = '"', dtype="string", usecols=combined_cols)
tmp

Unnamed: 0,IID,FID,f.4825.0.0,f.4825.1.0,f.4825.2.0,f.4825.3.0,f.4836.0.0,f.4836.1.0,f.4836.2.0,f.4836.3.0
0,1000019,1000019,,,,,,,,
1,1000022,1000022,,,,,,,,
2,1000035,1000035,,,,,,,,
3,1000046,1000046,,,No,,,,No,
4,1000054,1000054,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,No,,,,No,,,
486412,6025409,6025409,,No,,,,No,,
486413,6025411,6025411,No,,,,No,,,
486414,6025425,6025425,,,,,,,,


In [143]:
tmp = tmp[tmp["IID"].isin(filtered["IID"].to_list())]
tmp

Unnamed: 0,IID,FID,f.4825.0.0,f.4825.1.0,f.4825.2.0,f.4825.3.0,f.4836.0.0,f.4836.1.0,f.4836.2.0,f.4836.3.0
1,1000022,1000022,,,,,,,,
2,1000035,1000035,,,,,,,,
3,1000046,1000046,,,No,,,,No,
4,1000054,1000054,,,,,,,,
5,1000063,1000063,No,,,,No,,,
...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,No,,,,No,,,
486412,6025409,6025409,,No,,,,No,,
486413,6025411,6025411,No,,,,No,,,
486414,6025425,6025425,,,,,,,,


In [144]:
tmp = tmp.reset_index()
tmp

Unnamed: 0,index,IID,FID,f.4825.0.0,f.4825.1.0,f.4825.2.0,f.4825.3.0,f.4836.0.0,f.4836.1.0,f.4836.2.0,f.4836.3.0
0,1,1000022,1000022,,,,,,,,
1,2,1000035,1000035,,,,,,,,
2,3,1000046,1000046,,,No,,,,No,
3,4,1000054,1000054,,,,,,,,
4,5,1000063,1000063,No,,,,No,,,
...,...,...,...,...,...,...,...,...,...,...,...
429094,486411,6025390,6025390,No,,,,No,,,
429095,486412,6025409,6025409,,No,,,,No,,
429096,486413,6025411,6025411,No,,,,No,,,
429097,486414,6025425,6025425,,,,,,,,


In [160]:
filtered = pd.concat([filtered,tmp[['f.4825.0.0', 'f.4825.1.0', 'f.4825.2.0', 'f.4825.3.0','f.4836.0.0', 'f.4836.1.0', 'f.4836.2.0', 'f.4836.3.0']]], axis=1)
filtered

Unnamed: 0,level_0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,...,f2247_f2257,f2247_f2257_age,f.4825.0.0,f.4825.1.0,f.4825.2.0,f.4825.3.0,f.4836.0.0,f.4836.1.0,f.4836.2.0,f.4836.3.0
0,0,1,1000022,1000022,Male,1954,August,Yes,,,...,1,53.0,,,,,,,,
1,1,2,1000035,1000035,Male,1944,May,No,,,...,0,63.0,,,,,,,,
2,2,3,1000046,1000046,Female,1946,March,No,,No,...,0,73.0,,,No,,,,No,
3,3,4,1000054,1000054,Female,1942,January,No,,,...,0,65.0,,,,,,,,
4,4,5,1000063,1000063,Male,1967,April,No,,,...,0,,No,,,,No,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429094,440260,486411,6025390,6025390,Female,1942,March,No,,,...,0,67.0,No,,,,No,,,
429095,440261,486412,6025409,6025409,Female,1946,November,No,No,,...,0,,,No,,,,No,,
429096,440262,486413,6025411,6025411,Female,1960,November,No,,,...,0,,No,,,,No,,,
429097,440263,486414,6025425,6025425,Female,1963,August,No,,,...,0,,,,,,,,,


In [161]:
filtered.to_csv("~/project/guangyou/Tinnitus/saved_filtered.csv",index=False)

Four instances are placed in time order, according to https://biobank.ndph.ox.ac.uk/showcase/field.cgi?id=4836

In [1]:
import pandas as pd
filtered = pd.read_csv("~/project/guangyou/Tinnitus/saved_filtered.csv")
filtered

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,level_0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,...,f2247_f2257,f2247_f2257_age,f.4825.0.0,f.4825.1.0,f.4825.2.0,f.4825.3.0,f.4836.0.0,f.4836.1.0,f.4836.2.0,f.4836.3.0
0,0,1,1000022,1000022,Male,1954,August,Yes,,,...,1,53.0,,,,,,,,
1,1,2,1000035,1000035,Male,1944,May,No,,,...,0,63.0,,,,,,,,
2,2,3,1000046,1000046,Female,1946,March,No,,No,...,0,73.0,,,No,,,,No,
3,3,4,1000054,1000054,Female,1942,January,No,,,...,0,65.0,,,,,,,,
4,4,5,1000063,1000063,Male,1967,April,No,,,...,0,,No,,,,No,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429094,440260,486411,6025390,6025390,Female,1942,March,No,,,...,0,67.0,No,,,,No,,,
429095,440261,486412,6025409,6025409,Female,1946,November,No,No,,...,0,,,No,,,,No,,
429096,440262,486413,6025411,6025411,Female,1960,November,No,,,...,0,,No,,,,No,,,
429097,440263,486414,6025425,6025425,Female,1963,August,No,,,...,0,,,,,,,,,


In [2]:
header = filtered.columns
noise_wp_cols =  [col.strip('"') for col in header if "f.4825" in col]
loud_music_cols =  [col.strip('"') for col in header if "f.4836" in col]

In [29]:
filtered[noise_wp_cols].iloc[:,0].value_counts()

No                            108556
Yes, for more than 5 years     17123
Yes, for around 1-5 years       8068
Yes, for less than a year       7707
Do not know                     1185
Prefer not to answer              98
Name: f.4825.0.0, dtype: int64

In [4]:
filtered[noise_wp_cols].iloc[:,2].value_counts()

No                            29085
Yes, for more than 5 years     3406
Yes, for around 1-5 years      1747
Yes, for less than a year      1694
Do not know                     186
Prefer not to answer             55
Name: f.4825.2.0, dtype: int64

In [3]:
noise_loud_answers = {"No":0, "Yes, for less than a year":1, "Yes, for around 1-5 years":2, "Yes, for more than 5 years":3}

# if the answers we have are not sorted in the order that they're in the list, then that individual is inconsistent
def find_inconsistencies_noisy_loud(row):
    ans = [noise_loud_answers[i] for i in row if i in noise_loud_answers.keys()] 
    return sorted(ans) != ans
        

In [175]:
row = filtered.iloc[27,:][noise_wp_cols]
row

f.4825.0.0      No
f.4825.1.0    <NA>
f.4825.2.0    <NA>
f.4825.3.0    <NA>
Name: 27, dtype: object

In [10]:
exclude = filtered[noise_wp_cols].apply(find_inconsistencies_noisy_loud, axis=1)
filtered[exclude][["f3393"]+hearing_imp_f3393+noise_wp_cols]

Unnamed: 0,f3393,f.3393.0.0,f.3393.1.0,f.3393.2.0,f.3393.3.0,f.4825.0.0,f.4825.1.0,f.4825.2.0,f.4825.3.0
678,1,Yes,Yes,Yes,Yes,,"Yes, for less than a year","Yes, for less than a year",No
801,1,No,,Yes,,"Yes, for more than 5 years",,No,
909,1,Yes,,Yes,,"Yes, for less than a year",,No,
1064,0,No,No,,,"Yes, for around 1-5 years",No,,
1334,0,No,,No,,"Yes, for less than a year",,No,
...,...,...,...,...,...,...,...,...,...
426521,0,No,,No,,"Yes, for more than 5 years",,"Yes, for less than a year",
426784,0,No,No,No,,,"Yes, for more than 5 years","Yes, for around 1-5 years",
426852,0,No,,No,No,,,"Yes, for more than 5 years","Yes, for around 1-5 years"
427786,0,No,,No,,"Yes, for less than a year",,No,


In [4]:
# for the inconsistant cases, set them as NA instead of removing them
hearing_imp_f3393 = [col.strip('"') for col in header if "f.3393." in col]
hearing_imp_f2247 = [col.strip('"') for col in header if "f.2247." in col]
hearing_imp_f2257 = [col.strip('"') for col in header if "f.2257." in col]
exclude = filtered[noise_wp_cols].apply(find_inconsistencies_noisy_loud, axis=1)
filtered.loc[exclude,hearing_imp_f3393+hearing_imp_f2247+hearing_imp_f2257] = pd.NA
filtered

Unnamed: 0,level_0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,...,f2247_f2257,f2247_f2257_age,f.4825.0.0,f.4825.1.0,f.4825.2.0,f.4825.3.0,f.4836.0.0,f.4836.1.0,f.4836.2.0,f.4836.3.0
0,0,1,1000022,1000022,Male,1954,August,Yes,,,...,1,53.0,,,,,,,,
1,1,2,1000035,1000035,Male,1944,May,No,,,...,0,63.0,,,,,,,,
2,2,3,1000046,1000046,Female,1946,March,No,,No,...,0,73.0,,,No,,,,No,
3,3,4,1000054,1000054,Female,1942,January,No,,,...,0,65.0,,,,,,,,
4,4,5,1000063,1000063,Male,1967,April,No,,,...,0,,No,,,,No,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429094,440260,486411,6025390,6025390,Female,1942,March,No,,,...,0,67.0,No,,,,No,,,
429095,440261,486412,6025409,6025409,Female,1946,November,No,No,,...,0,,,No,,,,No,,
429096,440262,486413,6025411,6025411,Female,1960,November,No,,,...,0,,No,,,,No,,,
429097,440263,486414,6025425,6025425,Female,1963,August,No,,,...,0,,,,,,,,,


In [5]:
filtered[exclude][["f3393"]+hearing_imp_f3393]

Unnamed: 0,f3393,f.3393.0.0,f.3393.1.0,f.3393.2.0,f.3393.3.0
678,1,,,,
801,1,,,,
909,1,,,,
1064,0,,,,
1334,0,,,,
...,...,...,...,...,...
426521,0,,,,
426784,0,,,,
426852,0,,,,
427786,0,,,,


In [None]:
exclude.sum()

1350

In [5]:
exclude = filtered[loud_music_cols].apply(find_inconsistencies_noisy_loud, axis=1)
filtered.loc[exclude,hearing_imp_f3393+hearing_imp_f2247+hearing_imp_f2257] = pd.NA
filtered

Unnamed: 0,level_0,index,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,...,f2247_f2257,f2247_f2257_age,f.4825.0.0,f.4825.1.0,f.4825.2.0,f.4825.3.0,f.4836.0.0,f.4836.1.0,f.4836.2.0,f.4836.3.0
0,0,1,1000022,1000022,Male,1954,August,Yes,,,...,1,53.0,,,,,,,,
1,1,2,1000035,1000035,Male,1944,May,No,,,...,0,63.0,,,,,,,,
2,2,3,1000046,1000046,Female,1946,March,No,,No,...,0,73.0,,,No,,,,No,
3,3,4,1000054,1000054,Female,1942,January,No,,,...,0,65.0,,,,,,,,
4,4,5,1000063,1000063,Male,1967,April,No,,,...,0,,No,,,,No,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429094,440260,486411,6025390,6025390,Female,1942,March,No,,,...,0,67.0,No,,,,No,,,
429095,440261,486412,6025409,6025409,Female,1946,November,No,No,,...,0,,,No,,,,No,,
429096,440262,486413,6025411,6025411,Female,1960,November,No,,,...,0,,No,,,,No,,,
429097,440263,486414,6025425,6025425,Female,1963,August,No,,,...,0,,,,,,,,,


In [41]:
filtered[exclude][["f3393"]+hearing_imp_f3393]

Unnamed: 0,f3393,f.3393.0.0,f.3393.1.0,f.3393.2.0,f.3393.3.0
452,,,,,
886,,,,,
1414,,,,,
1573,,,,,
1619,,,,,
...,...,...,...,...,...
428241,,,,,
428264,,,,,
428499,,,,,
428522,,,,,


In [40]:
exclude.sum()

1371

### 6.4.2. The associations between each noise phenotype and each ARHI phenotype

For the separate analysis of f.4825 and f.4836, it's aiming to test each one of the noise phenotypes for associations with each ARHI phenotype 
$$ARHI phenotype \sim noise phenotype$$

Binary trait

Criterion 1:
* 0 = No
* 1 = Yes either > 5 years or 1-5 years
* Remove those who answer < 1 year

Criterion 2:
* 0 = No or answer < 1 year
* 1 = Yes either > 5 years or 1-5 years

For 4 visits, we will use their response from the last visit on f.4825 and f.4836 and for cases we will use their response at the first visit which they reported a hearing problem.

In [6]:
header = filtered.columns
indiv = ["IID", "FID"]
icd10_colnames = [col.strip('"') for col in header if "f.41270." in col]
icd10_ages = [col.strip('"') for col in header if "f.41280." in col]
icd9_colnames = [col.strip('"') for col in header if "f.41271." in col]
icd9_ages = [col.strip('"') for col in header if "f.41281." in col]
f20002_colnames = [col.strip('"') for col in header if "f.20002." in col]
reported_sex = ["f.31.0.0"]
genetic_sex = ["f.22001.0.0"]
ethnicity = [col.strip('"') for col in header if "f.21000." in col]
hearing_imp_f3393 = [col.strip('"') for col in header if "f.3393." in col]
hearing_imp_f2247 = [col.strip('"') for col in header if "f.2247." in col]
hearing_imp_f2257 = [col.strip('"') for col in header if "f.2257." in col]
tin_cols = [col.strip('"') for col in header if "f.4803." in col]
ages_f21003_col = [col.strip('"') for col in header if "f.21003." in col]
ages_f131258_col = [col.strip('"') for col in header if 'f.131258.' in col]
year_of_birth = [col.strip('"') for col in header if "f.34." in col]
month_of_birth = [col.strip('"') for col in header if "f.52." in col]

noise_wp_cols =  [col.strip('"') for col in header if "f.4825" in col]
loud_music_cols =  [col.strip('"') for col in header if "f.4836" in col]

In [7]:
# f3393
age_col = hearing_imp_f3393
phe_col = "f3393"

# f4825 
noi_col = noise_wp_cols

# f3339 case: get the earliest time that an individual said yes to having a phenotype
# f3339 ctrl: get the latest time that an individual said no to the phenotypes
def get_phen_age(row):
    if row[phe_col] == 1:
        temp = row[age_col].to_list()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "Yes":
                return row[noi_col][en]
    elif row[phe_col] == 0:
        temp = row[age_col].to_list()
        temp.reverse()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "No":
                return row[noi_col][-en-1]
    else:
        return pd.NA
    
filtered["f3393_f4825_1"] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
filtered[[phe_col]+age_col+noi_col+["f3393_f4825_1"]]

Unnamed: 0,f3393,f.3393.0.0,f.3393.1.0,f.3393.2.0,f.3393.3.0,f.4825.0.0,f.4825.1.0,f.4825.2.0,f.4825.3.0,f3393_f4825_1
0,0,No,,,,,,,,
1,0,No,,,,,,,,
2,0,,,No,,,,No,,No
3,0,No,,,,,,,,
4,0,No,,,,No,,,,No
...,...,...,...,...,...,...,...,...,...,...
429094,0,No,,,,No,,,,No
429095,0,,No,,,,No,,,No
429096,0,No,,,,No,,,,No
429097,0,,,,,,,,,


In [8]:
filtered["f3393_f4825_1"].value_counts(dropna=False)

NaN                           258791
No                            131186
Yes, for more than 5 years     19766
Yes, for around 1-5 years       9224
Yes, for less than a year       8696
Do not know                     1339
Prefer not to answer              97
Name: f3393_f4825_1, dtype: int64

In [8]:
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["f3393_f4825_1"] =filtered["f3393_f4825_1"].replace({"No": 0, \
                                                              "Yes, for less than a year": pd.NA, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
filtered["f3393_f4825_1"].value_counts(dropna=False)

NaN    268923
0.0    131186
1.0     28990
Name: f3393_f4825_1, dtype: int64

In [None]:
tmp = filtered[[phe_col]+age_col+noi_col+["f3393_f4825_1"]]
tmp[tmp["f3393"]==1]

Unnamed: 0,f3393,f.3393.0.0,f.3393.1.0,f.3393.2.0,f.3393.3.0,f.4825.0.0,f.4825.1.0,f.4825.2.0,f.4825.3.0,f3393_f4825_1
9,1,,,Yes,,,,No,,0
85,1,Yes,,,,No,,,,0
114,1,Yes,,,,"Yes, for around 1-5 years",,,,1
120,1,Yes,,,,,,,,
214,1,Yes,,,,No,,,,0
...,...,...,...,...,...,...,...,...,...,...
428815,1,Yes,,,,,,,,
428861,1,No,,Yes,,,,No,,0
428914,1,Yes,,,,"Yes, for less than a year",,,,
428949,1,Yes,,,,No,,,,0


In [56]:
tmp[tmp["f3393"]==1]["f3393_f4825_1"].isnull().sum()

8907

In [57]:
tmp[tmp["f3393"]==1]["f3393_f4825_1"].value_counts(dropna=False)

NaN    8907
0.0    4313
1.0    2132
Name: f3393_f4825_1, dtype: int64

In [10]:
filtered[filtered["f3393"]==0]["f3393_f4825_1"].value_counts(dropna=False)

NaN    259767
0.0    126873
1.0     26858
Name: f3393_f4825_1, dtype: int64

In [10]:
from scipy.stats import pearsonr
tmp = filtered[["f3393","f3393_f4825_1"]].dropna(axis=0)
pearsonr(tmp["f3393"], tmp["f3393_f4825_1"])

(0.07967150711232371, 8.278815285206419e-224)

In [62]:
from scipy.stats import spearmanr
spearmanr(tmp["f3393"], tmp["f3393_f4825_1"])

SpearmanrResult(correlation=0.07967150711232363, pvalue=8.278815283401719e-224)

In [11]:
from sklearn.linear_model import LogisticRegression
import numpy as np
Y = tmp["f3393"].values
X = np.array(tmp["f3393_f4825_1"].values).reshape(-1, 1)
reg = LogisticRegression(fit_intercept =True).fit(X, Y)
reg.score(X, Y)

0.9597630106882429

In [12]:
reg.coef_

array([[0.84741825]])

In [13]:
reg.intercept_

array([-3.38134983])

In [20]:
import statsmodels.api as sm
import numpy as np
phe_col = "f3393"
noi_col = loud_music_cols
noi_phe = "f4825"
tmp = filtered[[phe_col,"%s_%s_1"%(phe_col,noi_phe)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_1"%(phe_col,noi_phe)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(fii.summary())

Optimization terminated successfully.
         Current function value: 0.165980
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:               160176
Model:                          Logit   Df Residuals:                   160174
Method:                           MLE   Df Model:                            1
Date:                Wed, 16 Feb 2022   Pseudo R-squ.:                 0.01610
Time:                        20:57:22   Log-Likelihood:                -26586.
converged:                       True   LL-Null:                       -27021.
Covariance Type:            nonrobust   LLR p-value:                2.895e-191
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.3816      0.015   -218.397      0.000      -3.412      -3.351
x1             0.8480      0.

In [31]:
print(fii.params[1],fii.prsquared,fii.pvalues[1])

0.8480496466039168 0.016102927967829972 1.177247773222503e-211


In [100]:
# f4836 
noi_col = loud_music_cols

# f3339 case: get the earliest time that an individual said yes to having a phenotype
# f3339 ctrl: get the latest time that an individual said no to the phenotypes
def get_phen_age(row):
    if row[phe_col] == 1:
        temp = row[age_col].to_list()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "Yes":
                return row[noi_col][en]
    elif row[phe_col] == 0:
        temp = row[age_col].to_list()
        temp.reverse()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "No":
                return row[noi_col][-en-1]
    else:
        return pd.NA
    
filtered["f3393_f4836_1"] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
filtered["f3393_f4836_1"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["f3393_f4836_1"] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)


NaN                           256279
No                            148501
Yes, for around 1-5 years       7488
Yes, for more than 5 years      7139
Yes, for less than a year       4888
Do not know                     2236
Prefer not to answer              56
Name: f3393_f4836_1, dtype: int64

In [105]:
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["f3393_f4836_1"] =filtered["f3393_f4836_1"].replace({"No": 0, \
                                                              "Yes, for less than a year": pd.NA, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
filtered["f3393_f4836_1"].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["f3393_f4836_1"] =filtered["f3393_f4836_1"].replace({"No": 0, \


NaN    263459
0.0    148501
1.0     14627
Name: f3393_f4836_1, dtype: int64

In [78]:
filtered[filtered["f3393"]==1]["f3393_f4836_1"].value_counts(dropna=False)

NaN    8800
0.0    5910
1.0     642
Name: f3393_f4836_1, dtype: int64

In [83]:
filtered[filtered["f3393"]==0]["f3393_f4836_1"].value_counts(dropna=False)

NaN    254659
0.0    142591
1.0     13985
Name: f3393_f4836_1, dtype: int64

In [86]:
tmp = filtered[["f3393","f3393_f4836_1"]].dropna(axis=0)
pearsonr(tmp["f3393"], tmp["f3393_f4836_1"])

(0.005956773809117956, 0.016133091576173948)

In [80]:
Y = tmp["f3393"].values
X = np.array(tmp["f3393_f4836_1"].values).reshape(-1, 1)
reg = LinearRegression().fit(X, Y)
reg.score(X, Y)

3.548315421308956e-05

In [81]:
reg.coef_

array([0.00409372])

In [82]:
reg.intercept_

0.03979771179992054

In [88]:
tmp.value_counts(dropna=False)

f3393  f3393_f4836_1
0      0                142591
       1                 13985
1      0                  5910
       1                   642
dtype: int64

In [110]:
# combined f4825_f4836 

phe_col = "f3393"
# No to both f.4825 and f.4836 vs yes >5 years or 1-5 years for either f.4825 or f.4836.  
# If they did not answer yes to either to >5years or 1-5 years for either f.4825 or f.4836 but they were exposed to <1 year of noise for one or both traits they should be removed.
# def get_combined_phen(row):
#     if row["%s_f4825_1"%(phe_col)] == "No" and row["%s_f4836_1"%(phe_col)] == "No" :
#         return 0
#     elif row["%s_f4825_1"%(phe_col)] == "Yes, for around 1-5 years" or row["%s_f4825_1"%(phe_col)] == "Yes, for more than 5 years" or row["%s_f4836_1"%(phe_col)] == "Yes, for around 1-5 years"  or row["%s_f4836_1"%(phe_col)] == "Yes, for more than 5 years":
#         return 1
#     else:
#         return pd.NA
def get_combined_phen(row):
    if not pd.isna(row["%s_f4825_1"%(phe_col)]) and not pd.isna(row["%s_f4836_1"%(phe_col)]) and row["%s_f4825_1"%(phe_col)] == 0 and row["%s_f4836_1"%(phe_col)] == 0 :
        return 0
    elif (not pd.isna(row["%s_f4825_1"%(phe_col)]) and row["%s_f4825_1"%(phe_col)] == 1) or (not pd.isna(row["%s_f4836_1"%(phe_col)]) and row["%s_f4836_1"%(phe_col)] == 1):
        return 1
    else:
        return pd.NA
    
filtered["%s_f4825_f4836_1"%(phe_col)] = filtered[["%s_f4825_1"%(phe_col),"%s_f4836_1"%(phe_col)]].apply(get_combined_phen, axis=1)
filtered["%s_f4825_f4836_1"%(phe_col)].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_f4825_f4836_1"%(phe_col)] = filtered[["%s_f4825_1"%(phe_col),"%s_f4836_1"%(phe_col)]].apply(get_combined_phen, axis=1)


NaN    269355
0.0    118493
1.0     38739
Name: f3393_f4825_f4836_1, dtype: int64

In [111]:
tmp = filtered[["f3393","%s_f4825_f4836_1"%(phe_col)]].dropna(axis=0)
pearsonr(tmp["f3393"], tmp["%s_f4825_f4836_1"%(phe_col)])

(0.06694575768860471, 1.3115085605871955e-155)

In [112]:
Y = tmp["f3393"].values
X = np.array(tmp["%s_f4825_f4836_1"%(phe_col)].values).reshape(-1, 1)
reg = LinearRegression().fit(X, Y)
print(pearsonr(tmp[phe_col], tmp["%s_f4825_f4836_1"%(phe_col)]),reg.score(X, Y),reg.coef_,reg.intercept_)

(0.06694575768860471, 1.3115085605871955e-155) 0.004481734472501442 [0.0305965] 0.03287957938443625


In [10]:
import statsmodels.api as sm
from scipy.stats import pearsonr
import numpy as np
def get_phen_age(row):
    if row[phe_col] == 1:
        temp = row[age_col].to_list()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "Yes":
                return row[noi_col][en]
    elif row[phe_col] == 0:
        temp = row[age_col].to_list()
        temp.reverse()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "No":
                return row[noi_col][-en-1]
    else:
        return pd.NA
def get_combined_phen(row):
    if not pd.isna(row["%s_f4825_1"%(phe_col)]) and not pd.isna(row["%s_f4836_1"%(phe_col)]) and row["%s_f4825_1"%(phe_col)] == 0 and row["%s_f4836_1"%(phe_col)] == 0 :
        return 0
    elif (not pd.isna(row["%s_f4825_1"%(phe_col)]) and row["%s_f4825_1"%(phe_col)] == 1) or (not pd.isna(row["%s_f4836_1"%(phe_col)]) and row["%s_f4836_1"%(phe_col)] == 1):
        return 1
    else:
        return pd.NA

In [11]:
# f3393
age_col = hearing_imp_f3393
phe_col = "f3393"
code = 1
# f4825 
noi_col = noise_wp_cols
noi_phe = "f4825"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": pd.NA, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
# f4836 
noi_col = loud_music_cols
noi_phe = "f4836"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": pd.NA, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])

code = 2
# f4825 
noi_col = noise_wp_cols
noi_phe = "f4825"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
# No or answer < 1 year vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
# f4836 
noi_col = loud_music_cols
noi_phe = "f4836"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
# No or answer < 1 year vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
# combined noise
noi_phe = "f4825_f4836"
code  = 1
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[["%s_f4825_1"%(phe_col),"%s_f4836_1"%(phe_col)]].apply(get_combined_phen, axis=1)
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
code  = 2
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[["%s_f4825_1"%(phe_col),"%s_f4836_1"%(phe_col)]].apply(get_combined_phen, axis=1)
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])

Optimization terminated successfully.
         Current function value: 0.165980
         Iterations 7
(0.07967150711232361, 8.2788152850859475e-224) 0.8480496466039167 0.016102927967829972 1.1772477732226375e-211
Optimization terminated successfully.
         Current function value: 0.168450
         Iterations 7
(0.005956773809117956, 0.016133091576173948) 0.10218225901985212 0.00010262348557366074 0.016177596608613173
Optimization terminated successfully.
         Current function value: 0.165462
         Iterations 7
(0.07782585686987882, 4.151022276123206e-225) 0.8415264609638171 0.015347167829443342 4.807002954123868e-213
Optimization terminated successfully.
         Current function value: 0.167724
         Iterations 7
(0.006236843983763254, 0.010573930752444942) 0.10843251606605112 0.00011278478810317072 0.010610705216070495
Optimization terminated successfully.
         Current function value: 0.167235
         Iterations 7
(0.06694575768860471, 1.3115085605871955e-155) 0.689

In [12]:
# f2247
age_col = hearing_imp_f2247
phe_col = "f2247"
code = 1
# f4825 
noi_col = noise_wp_cols
noi_phe = "f4825"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": pd.NA, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
# f4836 
noi_col = loud_music_cols
noi_phe = "f4836"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": pd.NA, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])

code = 2
# f4825 
noi_col = noise_wp_cols
noi_phe = "f4825"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
# f4836 
noi_col = loud_music_cols
noi_phe = "f4836"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
# combined noise
noi_phe = "f4825_f4836"
code  = 1
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[["%s_f4825_1"%(phe_col),"%s_f4836_1"%(phe_col)]].apply(get_combined_phen, axis=1)
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
code  = 2
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[["%s_f4825_1"%(phe_col),"%s_f4836_1"%(phe_col)]].apply(get_combined_phen, axis=1)
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])

Optimization terminated successfully.
         Current function value: 0.571162
         Iterations 5
(0.18278616024304534, 0.0) 0.9760433946929985 0.026410626749403976 0.0
Optimization terminated successfully.
         Current function value: 0.583752
         Iterations 5
(0.07427325824751606, 8.106822468482063e-184) 0.5437305658355673 0.004425763415719652 6.712007973135533e-180
Optimization terminated successfully.
         Current function value: 0.574041
         Iterations 5
(0.17513457015038492, 0.0) 0.9525005348953774 0.024174400156988796 0.0
Optimization terminated successfully.
         Current function value: 0.585066
         Iterations 5
(0.07218929844644426, 8.748581185649002e-179) 0.5353368320930187 0.0041760421118344215 4.4493063281419186e-175
Optimization terminated successfully.
         Current function value: 0.570645
         Iterations 5
(0.18472426753545568, 0.0) 0.9027795888048036 0.02750919089120507 0.0
Optimization terminated successfully.
         Current fun

In [13]:
# f2257
age_col = hearing_imp_f2257
phe_col = "f2257"
code = 1
# f4825 
noi_col = noise_wp_cols
noi_phe = "f4825"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": pd.NA, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
# f4836 
noi_col = loud_music_cols
noi_phe = "f4836"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": pd.NA, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])

code = 2
# f4825 
noi_col = noise_wp_cols
noi_phe = "f4825"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
# f4836 
noi_col = loud_music_cols
noi_phe = "f4836"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
# combined noise
noi_phe = "f4825_f4836"
code  = 1
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[["%s_f4825_1"%(phe_col),"%s_f4836_1"%(phe_col)]].apply(get_combined_phen, axis=1)
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
code  = 2
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[["%s_f4825_1"%(phe_col),"%s_f4836_1"%(phe_col)]].apply(get_combined_phen, axis=1)
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])

Optimization terminated successfully.
         Current function value: 0.637681
         Iterations 4
(0.16302450280020991, 0.0) 0.853174621678098 0.019691082542203087 0.0
Optimization terminated successfully.
         Current function value: 0.648030
         Iterations 4
(0.07165101177414718, 1.9336738617432378e-172) 0.5077057298801098 0.003817780798108461 2.9307968754757973e-169
Optimization terminated successfully.
         Current function value: 0.640741
         Iterations 4
(0.1548040749725963, 0.0) 0.8262158903915151 0.017719959032647004 0.0
Optimization terminated successfully.
         Current function value: 0.649397
         Iterations 4
(0.06914721921162689, 2.1247175162456496e-165) 0.4964730694091865 0.003552470140585795 1.799025341207671e-162
Optimization terminated successfully.
         Current function value: 0.636066
         Iterations 5
(0.16973878546016408, 0.0) 0.801301085644823 0.02154302296984345 0.0
Optimization terminated successfully.
         Current funct

The defination of noise phenotype for the combined trait is a problem. According to the previous code, filtered["f2247_f2257"] = filtered["f2247"] & filtered["f2257"], and the ages are defined by the min(f2247_age, f2257_age)

In [22]:
# f2247_f2257
age_col = hearing_imp_f2247+hearing_imp_f2257
phe_col = "f2247_f2257"
code = 1
# f4825 
noi_col = noise_wp_cols
noi_phe = "f4825"
filtered[[phe_col,"f2247","f2257"]+age_col+noi_col]

Unnamed: 0,f2247_f2257,f2247,f2257,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,f.2257.1.0,f.2257.2.0,f.2257.3.0,f.4825.0.0,f.4825.1.0,f.4825.2.0,f.4825.3.0
0,1,1,1,Yes,,,,Yes,,,,,,,
1,0,0,1,No,,,,Yes,,,,,,,
2,0,0,1,No,,No,,No,,Yes,,,,No,
3,0,0,1,No,,,,Yes,,,,,,,
4,0,0,0,No,,,,No,,,,No,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429094,0,0,1,No,,,,Yes,,,,No,,,
429095,0,0,0,No,No,,,No,No,,,,No,,
429096,0,0,0,No,,,,No,,,,No,,,
429097,0,0,0,No,,,,No,,,,,,,


In [33]:
# case
row  = filtered[[phe_col,"f2247","f2257"]+age_col+noi_col].iloc[0,:]
temp = row[hearing_imp_f2247].to_list()
en_1 = pd.NA
en_2 = pd.NA
for en, i in enumerate(temp):
    if not pd.isna(i) and i == "Yes":
        en_1 = en
        break
    
temp = row[hearing_imp_f2257].to_list()
for en, i in enumerate(temp):
    if not pd.isna(i) and i == "Yes":
        en_2 = en
        break     
row[noi_col][min(en_1 , en_2)]

nan

In [34]:
print(en_1 , en_2)

0 0


In [39]:
# control
row  = filtered[[phe_col,"f2247","f2257"]+age_col+noi_col].iloc[2,:]
en_1 = pd.NA
en_2 = pd.NA
temp = row[hearing_imp_f2247].to_list()
temp.reverse()
for en, i in enumerate(temp):
    if not pd.isna(i) and i == "No":
        en_1 = en
        break
temp = row[hearing_imp_f2257].to_list()
temp.reverse()
for en, i in enumerate(temp):
    if not pd.isna(i) and i == "No":
        en_2 = en
        break   
row[noi_col][-min(en_1 , en_2)-1]

'No'

In [40]:
row  = filtered[[phe_col,"f2247","f2257"]+age_col+noi_col].iloc[1,:]
en_1 = pd.NA
en_2 = pd.NA
temp = row[hearing_imp_f2247].to_list()
temp.reverse()
for en, i in enumerate(temp):
    if not pd.isna(i) and i == "No":
        en_1 = en
        break
temp = row[hearing_imp_f2257].to_list()
temp.reverse()
for en, i in enumerate(temp):
    if not pd.isna(i) and i == "No":
        en_2 = en
        break   
row[noi_col][-min(en_1 , en_2)-1]

TypeError: boolean value of NA is ambiguous

In [42]:
def get_phen_combined(row):
    if row[phe_col] == 1:
        en_1 = pd.NA
        en_2 = pd.NA
        temp = row[hearing_imp_f2247].to_list()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "Yes":
                en_1 = en
                break
        temp = row[hearing_imp_f2257].to_list()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "Yes":
                en_2 = en
                break     
        row[noi_col][min(en_1 , en_2)]
        return row[noi_col][en]
    elif row[phe_col] == 0:
        en_1 = pd.NA
        en_2 = pd.NA
        temp = row[hearing_imp_f2247].to_list()
        temp.reverse()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "No":
                en_1 = en
                break
        temp = row[hearing_imp_f2257].to_list()
        temp.reverse()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "No":
                en_2 = en
                break   
        row[noi_col][-min(en_1 , en_2)-1]
        return row[noi_col][-en-1]
    else:
        return pd.NA
def get_combined_phen(row):
    if not pd.isna(row["%s_f4825_1"%(phe_col)]) and not pd.isna(row["%s_f4836_1"%(phe_col)]) and row["%s_f4825_1"%(phe_col)] == 0 and row["%s_f4836_1"%(phe_col)] == 0 :
        return 0
    elif (not pd.isna(row["%s_f4825_1"%(phe_col)]) and row["%s_f4825_1"%(phe_col)] == 1) or (not pd.isna(row["%s_f4836_1"%(phe_col)]) and row["%s_f4836_1"%(phe_col)] == 1):
        return 1
    else:
        return pd.NA

In [44]:
# f2247_f2257
age_col = hearing_imp_f2247+hearing_imp_f2257
phe_col = "f2247_f2257"
code = 1
# f4825 
noi_col = noise_wp_cols
noi_phe = "f4825"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col,"f2247","f2257"]+age_col+noi_col].apply(get_phen_combined, axis=1)
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": pd.NA, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
# f4836 
noi_col = loud_music_cols
noi_phe = "f4836"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col,"f2247","f2257"]+age_col+noi_col].apply(get_phen_combined, axis=1)
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": pd.NA, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])

code = 2
# f4825 
noi_col = noise_wp_cols
noi_phe = "f4825"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col,"f2247","f2257"]+age_col+noi_col].apply(get_phen_combined, axis=1)
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
# f4836 
noi_col = loud_music_cols
noi_phe = "f4836"
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[[phe_col,"f2247","f2257"]+age_col+noi_col].apply(get_phen_combined, axis=1)
# Remove those who answer < 1 year
# No vs. Yes either > 5 years or 1-5 years
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] =filtered["%s_%s_%i"%(phe_col,noi_phe,code)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
# combined noise
noi_phe = "f4825_f4836"
code  = 1
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[["%s_f4825_1"%(phe_col),"%s_f4836_1"%(phe_col)]].apply(get_combined_phen, axis=1)
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])
code  = 2
filtered["%s_%s_%i"%(phe_col,noi_phe,code)] = filtered[["%s_f4825_1"%(phe_col),"%s_f4836_1"%(phe_col)]].apply(get_combined_phen, axis=1)
tmp = filtered[[phe_col,"%s_%s_%i"%(phe_col,noi_phe,code)]].dropna(axis=0)
Y = tmp[phe_col].values
X = sm.add_constant(np.array(tmp["%s_%s_%i"%(phe_col,noi_phe,code)].values, dtype=float).reshape(-1, 1))
mod = sm.Logit(Y,X)
fii = mod.fit()
print(pearsonr(tmp[phe_col], tmp["%s_%s_%i"%(phe_col,noi_phe,code)]),fii.params[1],fii.prsquared,fii.pvalues[1])

TypeError: boolean value of NA is ambiguous

### 6.4.3. The associations between each noise phenotype and each ARHI phenotype

For the interaction analysis of f.4825 and f.4836, it's aiming to test interaction between the noise phenotypes and genetic variants 
$$ARHI phenotype \sim x + noise phenotype + x * noise phenotype$$

Binary criterion:
* 0 = No or < 1 year
* 1 = Yes either > 5 years or 1-5 years

In [121]:
# f3393
age_col = hearing_imp_f2247
phe_col = "f3393"
# f4825 
noi_col = noise_wp_cols
noi_phe = "f4825"
def get_phen_age(row):
    if row[phe_col] == 1:
        temp = row[age_col].to_list()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "Yes":
                return row[noi_col][en]
    elif row[phe_col] == 0:
        temp = row[age_col].to_list()
        temp.reverse()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "No":
                return row[noi_col][-en-1]
    else:
        return pd.NA
    
filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
filtered["%s_%s_2"%(phe_col,noi_phe)].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \


NaN    306215
0.0    104221
1.0     16151
Name: f3393_f4825_2, dtype: int64

In [122]:
# f3393
# f4836 
phe_col = "f3393"
noi_col = loud_music_cols
noi_phe = "f4836"
filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
filtered["%s_%s_2"%(phe_col,noi_phe)].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \


NaN    306633
0.0    111279
1.0      8675
Name: f3393_f4836_2, dtype: int64

In [124]:
phe_col = "f3393"
def get_combined_phen(row):
    if not pd.isna(row["%s_f4825_2"%(phe_col)]) and not pd.isna(row["%s_f4836_2"%(phe_col)]) and row["%s_f4825_2"%(phe_col)] == 0 and row["%s_f4836_2"%(phe_col)] == 0 :
        return 0
    elif (not pd.isna(row["%s_f4825_2"%(phe_col)]) and row["%s_f4825_2"%(phe_col)] == 1) or (not pd.isna(row["%s_f4836_2"%(phe_col)]) and row["%s_f4836_2"%(phe_col)] == 1):
        return 1
    else:
        return pd.NA
filtered["%s_f4825_f4836_2"%(phe_col)] = filtered[["%s_f4825_2"%(phe_col),"%s_f4836_2"%(phe_col)]].apply(get_combined_phen, axis=1)
filtered["%s_f4825_f4836_2"%(phe_col)].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_f4825_f4836_2"%(phe_col)] = filtered[["%s_f4825_2"%(phe_col),"%s_f4836_2"%(phe_col)]].apply(get_combined_phen, axis=1)


NaN    307012
0.0     97250
1.0     22325
Name: f3393_f4825_f4836_2, dtype: int64

In [125]:
# f2247
age_col = hearing_imp_f2247
phe_col = "f2247"
# f4825 
noi_col = noise_wp_cols
noi_phe = "f4825"
def get_phen_age(row):
    if row[phe_col] == 1:
        temp = row[age_col].to_list()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "Yes":
                return row[noi_col][en]
    elif row[phe_col] == 0:
        temp = row[age_col].to_list()
        temp.reverse()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "No":
                return row[noi_col][-en-1]
    else:
        return pd.NA
    
filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
filtered["%s_%s_2"%(phe_col,noi_phe)].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \


NaN    270336
0.0    130314
1.0     25937
Name: f2247_f4825_2, dtype: int64

In [126]:
# f2247
# f4836 
noi_col = loud_music_cols
noi_phe = "f4836"
filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
filtered["%s_%s_2"%(phe_col,noi_phe)].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \


NaN    271016
0.0    142481
1.0     13090
Name: f2247_f4836_2, dtype: int64

In [127]:
filtered["%s_f4825_f4836_2"%(phe_col)] = filtered[["%s_f4825_2"%(phe_col),"%s_f4836_2"%(phe_col)]].apply(get_combined_phen, axis=1)
filtered["%s_f4825_f4836_2"%(phe_col)].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_f4825_f4836_2"%(phe_col)] = filtered[["%s_f4825_2"%(phe_col),"%s_f4836_2"%(phe_col)]].apply(get_combined_phen, axis=1)


NaN    271468
0.0    120381
1.0     34738
Name: f2247_f4825_f4836_2, dtype: int64

In [128]:
# f2257
age_col = hearing_imp_f2247
phe_col = "f2257"
# f4825 
noi_col = noise_wp_cols
noi_phe = "f4825"
def get_phen_age(row):
    if row[phe_col] == 1:
        temp = row[age_col].to_list()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "Yes":
                return row[noi_col][en]
    elif row[phe_col] == 0:
        temp = row[age_col].to_list()
        temp.reverse()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "No":
                return row[noi_col][-en-1]
    else:
        return pd.NA
    
filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
filtered["%s_%s_2"%(phe_col,noi_phe)].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \


NaN    296888
0.0    108465
1.0     21234
Name: f2257_f4825_2, dtype: int64

In [129]:
# f2247
# f4836 
noi_col = loud_music_cols
noi_phe = "f4836"
filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
filtered["%s_%s_2"%(phe_col,noi_phe)].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \


NaN    297483
0.0    118577
1.0     10527
Name: f2257_f4836_2, dtype: int64

In [130]:
filtered["%s_f4825_f4836_2"%(phe_col)] = filtered[["%s_f4825_2"%(phe_col),"%s_f4836_2"%(phe_col)]].apply(get_combined_phen, axis=1)
filtered["%s_f4825_f4836_2"%(phe_col)].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_f4825_f4836_2"%(phe_col)] = filtered[["%s_f4825_2"%(phe_col),"%s_f4836_2"%(phe_col)]].apply(get_combined_phen, axis=1)


NaN    297821
0.0    100494
1.0     28272
Name: f2257_f4825_f4836_2, dtype: int64

In [131]:
# f2247_f2257
age_col = hearing_imp_f2247
phe_col = "f2247_f2257"
# f4825 
noi_col = noise_wp_cols
noi_phe = "f4825"
def get_phen_age(row):
    if row[phe_col] == 1:
        temp = row[age_col].to_list()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "Yes":
                return row[noi_col][en]
    elif row[phe_col] == 0:
        temp = row[age_col].to_list()
        temp.reverse()
        for en, i in enumerate(temp):
            if not pd.isna(i) and i == "No":
                return row[noi_col][-en-1]
    else:
        return pd.NA
    
filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
filtered["%s_%s_2"%(phe_col,noi_phe)].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \


NaN    276919
0.0    125141
1.0     24527
Name: f2247_f2257_f4825_2, dtype: int64

In [132]:
# f2247_f2257
# f4836 
noi_col = loud_music_cols
noi_phe = "f4836"
filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \
                                                              "Yes, for less than a year": 0, \
                                                              "Yes, for around 1-5 years": 1, \
                                                              "Yes, for more than 5 years": 1, \
                                                              "Do not know": pd.NA, \
                                                              "Prefer not to answer": pd.NA})
filtered["%s_%s_2"%(phe_col,noi_phe)].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] = filtered[[phe_col]+age_col+noi_col].apply(get_phen_age, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_%s_2"%(phe_col,noi_phe)] =filtered["%s_%s_2"%(phe_col,noi_phe)].replace({"No": 0, \


NaN    277584
0.0    136630
1.0     12373
Name: f2247_f2257_f4836_2, dtype: int64

In [133]:
filtered["%s_f4825_f4836_2"%(phe_col)] = filtered[["%s_f4825_2"%(phe_col),"%s_f4836_2"%(phe_col)]].apply(get_combined_phen, axis=1)
filtered["%s_f4825_f4836_2"%(phe_col)].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered["%s_f4825_f4836_2"%(phe_col)] = filtered[["%s_f4825_2"%(phe_col),"%s_f4836_2"%(phe_col)]].apply(get_combined_phen, axis=1)


NaN    277997
0.0    115725
1.0     32865
Name: f2247_f2257_f4825_f4836_2, dtype: int64

In [None]:
### 6.4.4. Save noise phenotype files

In [134]:
filtered.to_csv("~/project/guangyou/Tinnitus/saved_filtered_withallnoise_20220215.csv",index=False)

In [145]:
df = pd.read_csv("~/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2",header=0,sep="\t")
df

Unnamed: 0,FID,IID,sex,f3393,age,PC1,PC2
0,1000112,1000112,0,1,68,0.011931,0.031198
1,1001067,1001067,0,1,50,0.004472,0.002904
2,1001384,1001384,1,1,61,0.002001,-0.001850
3,1001459,1001459,0,1,64,0.010770,0.022391
4,1002548,1002548,0,1,62,0.000226,-0.014383
...,...,...,...,...,...,...,...
252914,6025363,6025363,0,0,64,0.004982,0.006218
252915,6025409,6025409,1,0,66,0.002693,-0.003802
252916,6025411,6025411,1,0,49,0.014998,0.007031
252917,6025425,6025425,1,0,44,0.017568,0.020817


In [146]:
phe_col = "f3393"
df = df.merge(filtered[["IID","%s_f4825_2"%(phe_col),"%s_f4836_2"%(phe_col),"%s_f4825_f4836_2"%(phe_col)]],how = "left",left_on="IID",right_on="IID")
df

Unnamed: 0,FID,IID,sex,f3393,age,PC1,PC2,f3393_f4825_2,f3393_f4836_2,f3393_f4825_f4836_2
0,1000112,1000112,0,1,68,0.011931,0.031198,0,0,0
1,1001067,1001067,0,1,50,0.004472,0.002904,0,0,0
2,1001384,1001384,1,1,61,0.002001,-0.001850,1,1,1
3,1001459,1001459,0,1,64,0.010770,0.022391,,,
4,1002548,1002548,0,1,62,0.000226,-0.014383,0,0,0
...,...,...,...,...,...,...,...,...,...,...
252914,6025363,6025363,0,0,64,0.004982,0.006218,,,
252915,6025409,6025409,1,0,66,0.002693,-0.003802,0,0,0
252916,6025411,6025411,1,0,49,0.014998,0.007031,0,0,0
252917,6025425,6025425,1,0,44,0.017568,0.020817,,,


In [148]:
df.to_csv("~/UKBiobank/results/REGENIE_results/results_imputed_data/2022_02_15_regenie_interaction_noise/phenotypes/021522_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2_noise",index=False,sep="\t")

In [149]:
# f2247
phe_col = "f2247"
df = pd.read_csv("~/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/100521_UKBB_Hearing_difficulty_f2247_expandedwhite_110453cases_237318ctrl_500k_PC1_PC2",header=0,sep="\t")
df = df.merge(filtered[["IID","%s_f4825_2"%(phe_col),"%s_f4836_2"%(phe_col),"%s_f4825_f4836_2"%(phe_col)]],how = "left",left_on="IID",right_on="IID")
df.to_csv("~/UKBiobank/results/REGENIE_results/results_imputed_data/2022_02_15_regenie_interaction_noise/phenotypes/021522_UKBB_Hearing_difficulty_f2247_expandedwhite_110453cases_237318ctrl_500k_PC1_PC2_noise",index=False,sep="\t")

In [150]:
# f2257
phe_col = "f2257"
df = pd.read_csv("~/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/100521_UKBB_Hearing_noise_f2257_expandedwhite_161443cases_237318ctrl_500k_PC1_PC2",header=0,sep="\t")
df = df.merge(filtered[["IID","%s_f4825_2"%(phe_col),"%s_f4836_2"%(phe_col),"%s_f4825_f4836_2"%(phe_col)]],how = "left",left_on="IID",right_on="IID")
df.to_csv("~/UKBiobank/results/REGENIE_results/results_imputed_data/2022_02_15_regenie_interaction_noise/phenotypes/021522_UKBB_Hearing_noise_f2257_expandedwhite_161443cases_237318ctrl_500k_PC1_PC2_noise",index=False,sep="\t")

In [151]:
# f2247_f2257
phe_col = "f2247_f2257"
df = pd.read_csv("~/UKBiobank/phenotype_files/hearing_impairment/fulldb_500K/100521_UKBB_Combined_f2247_f2257_expandedwhite_93258cases_237318ctrl_500k_PC1_PC2",header=0,sep="\t")
df = df.merge(filtered[["IID","%s_f4825_2"%(phe_col),"%s_f4836_2"%(phe_col),"%s_f4825_f4836_2"%(phe_col)]],how = "left",left_on="IID",right_on="IID")
df.to_csv("~/UKBiobank/results/REGENIE_results/results_imputed_data/2022_02_15_regenie_interaction_noise/phenotypes/021522_UKBB_Combined_f2247_f2257_expandedwhite_93258cases_237318ctrl_500k_PC1_PC2",index=False,sep="\t")

## 6.5. Identify Cases

**Analysis plan:**

1. Individuals who currently have tinnitus (all four yes categories) vs no never only controlling for sex, age, noisy workplace and loud music frequency ("crude") (No tinnitus vs anytype of tinnitus). For this analysis the missing data of the noise variables was imputed using the median for cases and controls separately

3. Individuals in the two top YES categories vs NO never ('No' tinnitus vs 'Yes, now all of the time' and 'Yes, now most of the time')

4. Individuals that say yes in the top 3 categories vs No never (Remove category 'yes, but not now, but have it in the past')

5. Individuals who currently have tinnitus (all four yes categories and tinnitus codes). Not filtering for issues with noisy workplace and loud music.

In [None]:
# return 1 if we have a yes (used to find phenos)
def find_yes(row):
    for i in row:
        if not pd.isna(i) and i != "Prefer not to answer" and i in tin_ans.keys() and tin_ans[i] == 1:
            return 1
    return 0

### 6.5.1. Analysis Plan 1

In [None]:
tin_ans = {"Do not know":9, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [None]:
tinn_yes = filtered[tin_cols].apply(find_yes, axis=1)

In [None]:
filtered["tinnitus_1"] = tinn_yes

### 6.5.2. Analysis Plan 2

In [None]:
tin_ans = {"Do not know":9, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [None]:
tinn_yes = filtered[tin_cols].apply(find_yes, axis=1)

In [None]:
filtered["tinnitus_2"] = tinn_yes

### 6.5.3. Analysis Plan 3

In [None]:
tin_ans = {"Do not know":9, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [None]:
tinn_yes = filtered[tin_cols].apply(find_yes, axis=1)

In [None]:
sum(tinn_yes)

In [None]:
filtered["tinnitus_3"] = tinn_yes

### 6.5.4. Analysis Plan 4

In [None]:
filtered_for4 = saved_tinn

In [None]:
tin_ans = {"Do not know":9, 'Yes, but not now, but have in the past':1, 'Yes, now some of the time':1, 'Yes, now a lot of the time':1, 'Yes, now most or all of the time':1, 'No, never':0}

In [None]:
tinn_yes = filtered_for4[tin_cols].apply(find_yes, axis=1)

In [None]:
sum(tinn_yes | tinn_icd10 | tinn_icd9 | tinn_self_report)

In [None]:
filtered_for4["tinnitus_4"] = tinn_yes | tinn_icd10 | tinn_icd9 | tinn_self_report

In [None]:
sum(filtered_for4["tinnitus_4"])

## 6.6. File Output

In [None]:
filtered

In [None]:
filtered_for4

In [None]:
filtered[filtered["tinnitus_pure_ctrl"] == 0][["FID", "IID", "sex", "tinnitus_pure_ctrl", "tinnitus_age", "noise_wp", "loud_music"]].to_csv("tinnitus_pure_ctrl_pheno_file.tsv", sep='\t', index=False)
#filtered[filtered["tinnitus_pure_ctrl"] == 0][["FID", "IID", "sex", "tinnitus_pure_ctrl", "ctrl_age"]].to_csv("tinnitus_pure_ctrl_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["tinnitus_1"] == 1][["FID", "IID", "sex", "tinnitus_1", "tinnitus_age", "noise_wp", "loud_music"]].to_csv("tinnitus_1_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["tinnitus_2"] == 1][["FID", "IID", "sex", "tinnitus_2", "tinnitus_age", "noise_wp", "loud_music"]].to_csv("tinnitus_2_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered[filtered["tinnitus_3"] == 1][["FID", "IID", "sex", "tinnitus_3", "tinnitus_age", "noise_wp", "loud_music"]].to_csv("tinnitus_3_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered_for4[filtered_for4["tinnitus_4"] == 1][["FID", "IID", "sex", "tinnitus_4", "tinnitus_age"]].to_csv("tinnitus_4_pheno_file.tsv", sep='\t', index=False)

In [None]:
filtered_for4[filtered_for4["tinnitus_4"] == 1][["FID", "IID", "sex", "tinnitus_4", "tinnitus_age"]]

# 7. Merge Pheno with Ctrl

## 7.1. f2247, f2257, f3393, and mendilian

In [None]:
ctrl_file_name = "pure_ctrl_pheno_file.tsv"
f3393_file_name = "f3393_pheno_file.tsv"
f2247_file_name = "f2247_pheno_file.tsv"
f2257_file_name = "f2257_pheno_file.tsv"
f2247_f2257_file_name = "f2247_f2257_pheno_file.tsv"
mendilianlike_file_name = "mendelian_pheno_file.tsv"

In [None]:
f3393 = pd.read_csv(f3393_file_name, sep="\t")
f2247 = pd.read_csv(f2247_file_name, sep="\t")
f2257 = pd.read_csv(f2257_file_name, sep="\t")
f2247_f2257 = pd.read_csv(f2247_f2257_file_name, sep="\t")
ctrl = pd.read_csv(ctrl_file_name, sep="\t")
mendlike = pd.read_csv(mendilianlike_file_name, sep="\t")

In [None]:
print("ctrl: ",len(ctrl))
print("f2247: ",len(f2247))
print("f2257: ",len(f2257))
print("f2247_f2257: ",len(f2247_f2257))
print("f3393: ",len(f3393))
print("mendlike: ",len(mendlike))

In [None]:
mendlike[mendlike["mendelian_age"] > 40]

In [None]:
mendlike[mendlike["mendelian_age"] < 18]

### 7.1.1. f2247

In [None]:
# create pheno file
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f2247.columns[3]})
f2247 = f2247.rename(columns={f2247.columns[4]:"age"})

full_pheno = f2247.append(ctrl)
full_pheno[["FID", "IID", "sex", "f2247", "age"]].to_csv("090321_UKBB_Hearing_difficulty_f2247_expandedwhite_45502cases_96601ctrl", sep='\t', index=False)

In [None]:
# create PCA pheno file
full_pheno[["FID", "IID", "ethnicity"]].to_csv("090321_UKBB_Hearing_difficulty_f2247_expandedwhite_45502cases_96601ctrl.phenopca", sep='\t', index=False)

In [None]:
# keep id file for genotype data selection
full_pheno[["FID", "IID"]].to_csv("090321_UKBB_Hearing_difficulty_f2247_expandedwhite_45502cases_96601ctrl.keep_id", sep='\t', index=False, header=False)

### 7.1.2. f2257

In [None]:
# create pheno file
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f2257.columns[3]})
f2257 = f2257.rename(columns={f2257.columns[4]:"age"})

full_pheno = f2257.append(ctrl)
full_pheno[["FID", "IID", "sex", "f2257", "age"]].to_csv("090321_UKBB_Hearing_noise_f2257_expandedwhite_65660cases_96601ctrl", sep='\t', index=False)

In [None]:
# create PCA pheno file
full_pheno[["FID", "IID", "ethnicity"]].to_csv("090321_UKBB_Hearing_noise_f2257_expandedwhite_65660cases_96601ctrl.phenopca", sep='\t', index=False)

In [None]:
# keep id file for genotype data selection
full_pheno[["FID", "IID"]].to_csv("090321_UKBB_Hearing_noise_f2257_expandedwhite_65660cases_96601ctrl.keep_id", sep='\t', index=False, header=False)

### 7.1.3. f2247_f2257

In [None]:
# create pheno file
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f2247_f2257.columns[3]})
f2247_f2257 = f2247_f2257.rename(columns={f2247_f2257.columns[4]:"age"})

full_pheno = f2247_f2257.append(ctrl)
full_pheno[["FID", "IID", "sex", "f2247_f2257", "age"]].to_csv("090321_UKBB_Combined_f2247_f2257_expandedwhite_38410cases_96601ctrl", sep='\t', index=False)

In [None]:
# create PCA pheno file
full_pheno[["FID", "IID", "ethnicity"]].to_csv("090321_UKBB_Combined_f2247_f2257_expandedwhite_38410cases_96601ctrl.phenopca", sep='\t', index=False)

In [None]:
# keep id file for genotype data selection
full_pheno[["FID", "IID"]].to_csv("090321_UKBB_Combined_f2247_f2257_expandedwhite_38410cases_96601ctrl.keep_id", sep='\t', index=False, header=False)

### 7.1.4. f3393

In [None]:
# create pheno file
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:f3393.columns[3]})
f3393 = f3393.rename(columns={f3393.columns[4]:"age"})

full_pheno = f3393.append(ctrl)
full_pheno[["FID", "IID", "sex", "f3393", "age"]].to_csv("090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl", sep='\t', index=False)

In [None]:
# create PCA pheno file
full_pheno[["FID", "IID", "ethnicity"]].to_csv("090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl.phenopca", sep='\t', index=False)

In [None]:
# keep id file for genotype data selection
full_pheno[["FID", "IID"]].to_csv("090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl.keep_id", sep='\t', index=False, header=False)

### 7.1.5. Mendelian

In [None]:
# create pheno file
ctrl = ctrl.rename(columns={ctrl.columns[4]:"age", ctrl.columns[3]:mendlike.columns[3]})
mendlike = mendlike.rename(columns={mendlike.columns[4]:"age"})

full_pheno = mendlike.append(ctrl)
full_pheno[["FID", "IID", "sex", "mendelian", "age"]].to_csv("090321_UKBB_Mendelian_expandedwhite_2686cases_96601ctrl", sep='\t', index=False)

In [None]:
# create PCA pheno file
full_pheno[["FID", "IID", "ethnicity"]].to_csv("090321_UKBB_Mendelian_expandedwhite_2686cases_96601ctrl.phenopca", sep='\t', index=False)

In [None]:
# keep id file for genotype data selection
full_pheno[["FID", "IID"]].to_csv("090321_UKBB_Mendelian_expandedwhite_2686rcases_96601ctrl.keep_id", sep='\t', index=False, header=False)

## 7.2. Tinnitus

In [None]:
tinnitus_ctrl_file_name = "tinnitus_pure_ctrl_pheno_file.tsv"
tinnitus_1_file_name = "tinnitus_1_pheno_file.tsv"
tinnitus_2_file_name = "tinnitus_2_pheno_file.tsv"
tinnitus_3_file_name = "tinnitus_3_pheno_file.tsv"
tinnitus_4_file_name = "tinnitus_4_pheno_file.tsv"

In [None]:
tinnitus_ctrl = pd.read_csv(tinnitus_ctrl_file_name, sep="\t")
tinnitus_1 = pd.read_csv(tinnitus_1_file_name, sep="\t")
tinnitus_2 = pd.read_csv(tinnitus_2_file_name, sep="\t")
tinnitus_3 = pd.read_csv(tinnitus_3_file_name, sep="\t")
tinnitus_4 = pd.read_csv(tinnitus_4_file_name, sep="\t")

In [None]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[4]:"age"})
tinnitus_1 = tinnitus_1.rename(columns={tinnitus_1.columns[4]:"age"})
tinnitus_2 = tinnitus_2.rename(columns={tinnitus_2.columns[4]:"age"})
tinnitus_3 = tinnitus_3.rename(columns={tinnitus_3.columns[4]:"age"})
tinnitus_4 = tinnitus_4.rename(columns={tinnitus_4.columns[4]:"age"})

In [None]:
print("ctrl: ",len(tinnitus_ctrl))
print("tinnitus 1: ",len(tinnitus_1))
print("tinnitus 2: ",len(tinnitus_2))
print("tinnitus 3: ",len(tinnitus_3))
print("tinnitus 4: ",len(tinnitus_4))

### 7.2.1. Analysis 1

In [None]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[3]:tinnitus_1.columns[3]})
full_tinnitus_1 = tinnitus_1.append(tinnitus_ctrl)
full_tinnitus_1

### 7.2.2. Analysis 2

In [None]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[3]:tinnitus_2.columns[3]})
full_tinnitus_2 = tinnitus_2.append(tinnitus_ctrl)
full_tinnitus_2

### 7.2.3. Analysis 3

In [None]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[3]:tinnitus_3.columns[3]})
full_tinnitus_3 = tinnitus_3.append(tinnitus_ctrl)
full_tinnitus_3

### 7.2.4. Analysis 4

In [None]:
tinnitus_ctrl = tinnitus_ctrl.rename(columns={tinnitus_ctrl.columns[3]:tinnitus_4.columns[3]})
full_tinnitus_4 = tinnitus_4.append(tinnitus_ctrl[tinnitus_ctrl.columns[:-2]])
full_tinnitus_4