## Look for HSPG2 rare variants and associated phenotypes

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Collect the column names needed for this analysis from the CSV header.
# Only the first line of the file is read here.

def _header_columns(header, tag):
    """Return the names in `header` that contain `tag`, with surrounding double quotes removed."""
    return [col.strip('"') for col in header if tag in col]


with open("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/082321_UKBB_exomes.csv") as fp:
    line = fp.readline() # header
    # readline() keeps the line terminator. Without removing it, the last
    # field would look like 'name"\n' and strip('"') could not drop its
    # closing quote, leaving a column name that does not exist in the file.
    header = line.rstrip("\r\n").split(",")

# Identifier columns and single fixed-name fields are listed explicitly.
indiv = ["IID", "FID"]
reported_sex = ["f.31.0.0"]
genetic_sex = ["f.22001.0.0"]

# Every other group is selected by the field tag embedded in the column name.
icd10_colnames = _header_columns(header, "f.41270.")
icd10_ages = _header_columns(header, "f.41280.")
icd9_colnames = _header_columns(header, "f.41271.")
icd9_ages = _header_columns(header, "f.41281.")
f20002_colnames = _header_columns(header, "f.20002.")
ethnicity = _header_columns(header, "f.21000.")
hearing_imp_f3393 = _header_columns(header, "f.3393.")
hearing_imp_f2247 = _header_columns(header, "f.2247.")
hearing_imp_f2257 = _header_columns(header, "f.2257.")
tin_cols = _header_columns(header, "f.4803.")
ages_f21003_col = _header_columns(header, "f.21003.")
ages_f131258_col = _header_columns(header, 'f.131258.')
year_of_birth = _header_columns(header, "f.34.")
month_of_birth = _header_columns(header, "f.52.")

In [3]:
# Flatten the per-field column lists into the single list of columns to load.
# The order matches the order in which the groups are listed here.
combined_cols = [
    *indiv,
    *icd10_colnames,
    *icd10_ages,
    *icd9_colnames,
    *icd9_ages,
    *f20002_colnames,
    *ethnicity,
    *reported_sex,
    *genetic_sex,
    *hearing_imp_f3393,
    *hearing_imp_f2247,
    *hearing_imp_f2257,
    *tin_cols,
    *ages_f21003_col,
    *ages_f131258_col,
    *year_of_birth,
    *month_of_birth,
]

In [4]:
# Load only the selected columns for every individual in the phenotype file.
# All values are read as pandas "string" dtype.
df = pd.read_csv(
    "/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/082321_UKBB_exomes.csv",
    usecols=combined_cols,
    dtype="string",
    quotechar='"',
)
df

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.2247.0.0,f.2247.1.0,f.2247.2.0,f.2247.3.0,f.2257.0.0,f.2257.1.0,f.2257.2.0,f.2257.3.0,f.3393.0.0,f.3393.1.0,f.3393.2.0,f.3393.3.0,f.4803.0.0,f.4803.1.0,f.4803.2.0,f.4803.3.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,f.20002.0.10,f.20002.0.11,f.20002.0.12,f.20002.0.13,f.20002.0.14,f.20002.0.15,f.20002.0.16,f.20002.0.17,f.20002.0.18,...,f.41281.0.8,f.41281.0.9,f.41281.0.10,f.41281.0.11,f.41281.0.12,f.41281.0.13,f.41281.0.14,f.41281.0.15,f.41281.0.16,f.41281.0.17,f.41281.0.18,f.41281.0.19,f.41281.0.20,f.41281.0.21,f.41281.0.22,f.41281.0.23,f.41281.0.24,f.41281.0.25,f.41281.0.26,f.41281.0.27,f.41281.0.28,f.41281.0.29,f.41281.0.30,f.41281.0.31,f.41281.0.32,f.41281.0.33,f.41281.0.34,f.41281.0.35,f.41281.0.36,f.41281.0.37,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131258.0.0
0,1000019,1000019,Female,1960,November,Yes,,,,Yes,,,,No,,,,,,,,1111,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1000035,1000035,Male,1944,May,No,,,,Yes,,,,No,,,,,,,,1396,1473,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1000078,1000078,Female,1955,June,No,No,No,,No,No,No,,,No,No,,,"No, never","No, never",,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1000081,1000081,Male,1942,February,No,,,,No,,,,No,,,,"No, never",,,,1075,1440,1473,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1000198,1000198,Female,1967,July,Yes,,,,Yes,,,,No,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1969-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,6025295,6025295,Male,1961,April,No,,,,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
200615,6025319,6025319,Female,1953,March,No,,,,No,,,,No,,,,"No, never",,,,1065,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
200616,6025346,6025346,Female,1954,October,No,,,,No,,,,,,,,,,,,1452,1265,1387,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
200617,6025363,6025363,Male,1944,April,No,,,,No,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [5]:
# Names of the loaded columns whose name contains the "f.41270" tag
# (the ICD-10 columns of the database).
icd10_colnames = list(filter(lambda name: "f.41270" in name, df.columns))

In [6]:
# Restrict the full table to the ICD-10 columns only (all rows kept).
icd10 = df.loc[:, icd10_colnames]
icd10

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,f.41270.0.10,f.41270.0.11,f.41270.0.12,f.41270.0.13,f.41270.0.14,f.41270.0.15,f.41270.0.16,f.41270.0.17,f.41270.0.18,f.41270.0.19,f.41270.0.20,f.41270.0.21,f.41270.0.22,f.41270.0.23,f.41270.0.24,f.41270.0.25,f.41270.0.26,f.41270.0.27,f.41270.0.28,f.41270.0.29,f.41270.0.30,f.41270.0.31,f.41270.0.32,f.41270.0.33,f.41270.0.34,f.41270.0.35,f.41270.0.36,f.41270.0.37,f.41270.0.38,f.41270.0.39,...,f.41270.0.186,f.41270.0.187,f.41270.0.188,f.41270.0.189,f.41270.0.190,f.41270.0.191,f.41270.0.192,f.41270.0.193,f.41270.0.194,f.41270.0.195,f.41270.0.196,f.41270.0.197,f.41270.0.198,f.41270.0.199,f.41270.0.200,f.41270.0.201,f.41270.0.202,f.41270.0.203,f.41270.0.204,f.41270.0.205,f.41270.0.206,f.41270.0.207,f.41270.0.208,f.41270.0.209,f.41270.0.210,f.41270.0.211,f.41270.0.212,f.41270.0.213,f.41270.0.214,f.41270.0.215,f.41270.0.216,f.41270.0.217,f.41270.0.218,f.41270.0.219,f.41270.0.220,f.41270.0.221,f.41270.0.222,f.41270.0.223,f.41270.0.224,f.41270.0.225
0,E041,H738,M750,M754,M758,N898,N920,N946,R104,Z038,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,H269,K579,K590,K621,M5459,N40,R040,R31,R398,Z466,Z538,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,C19,C20,D037,D125,K635,L720,Z860,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,E780,H251,H269,I10,I210,I219,I251,I252,I258,I259,I842,I849,K409,K573,K620,K632,K638,K649,M171,M175,M8796,Z539,Z824,Z861,Z864,Z922,Z955,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,D171,I10,J301,K409,N898,O800,Z370,Z721,Z822,Z861,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200614,D171,I845,I846,K318,K429,K529,K602,K610,K921,R073,R104,R221,Z871,Z880,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
200615,E669,I10,K801,K802,K85,M179,M199,M233,M8956,N921,R102,T840,T848,Z501,Z507,Z904,Z966,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
200616,O074,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
200617,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [10]:
# ICD-10 codes that qualify an individual for inclusion: "M62", "M620",
# and the ten five-character codes "M6200" through "M6209".
inclusion_list = ["M62", "M620"] + [f"M620{digit}" for digit in range(10)]

In [11]:
# Tells whether an individual should be included, based on the inclusion list.
def contains_inclusion(row, inclusion_list):
    """Return True if any non-missing value in `row` appears in `inclusion_list`.

    Parameters
    ----------
    row : iterable
        Values to inspect (e.g. one row of codes); missing values are skipped.
    inclusion_list : container
        Codes that count as a match.

    Returns
    -------
    bool
        True on the first match, False if nothing matches.
    """
    # The missing-value check must come first: it short-circuits before the
    # membership test is attempted on a missing entry.
    return any(
        (not pd.isna(code)) and (code in inclusion_list)
        for code in row
    )

In [12]:
# Flag the individuals who carry at least one ICD-10 code from inclusion_list.
# Row-wise predicate, kept in case another cell still calls it.
in_fxn_icd10 = lambda row: contains_inclusion(row, inclusion_list)
# Vectorised membership test over the whole ICD-10 table: one boolean per
# individual, True when any of their codes is in inclusion_list. Missing
# entries never match. This avoids a Python-level loop over every cell.
in_10 = icd10.isin(inclusion_list).any(axis=1)

In [13]:
# Keep only the flagged individuals in a new working table;
# df itself is left unchanged.
filtered = df.loc[in_10]

In [15]:
# Show the ICD-10 columns of the selected individuals.
filtered.loc[:, icd10_colnames]

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,f.41270.0.10,f.41270.0.11,f.41270.0.12,f.41270.0.13,f.41270.0.14,f.41270.0.15,f.41270.0.16,f.41270.0.17,f.41270.0.18,f.41270.0.19,f.41270.0.20,f.41270.0.21,f.41270.0.22,f.41270.0.23,f.41270.0.24,f.41270.0.25,f.41270.0.26,f.41270.0.27,f.41270.0.28,f.41270.0.29,f.41270.0.30,f.41270.0.31,f.41270.0.32,f.41270.0.33,f.41270.0.34,f.41270.0.35,f.41270.0.36,f.41270.0.37,f.41270.0.38,f.41270.0.39,...,f.41270.0.186,f.41270.0.187,f.41270.0.188,f.41270.0.189,f.41270.0.190,f.41270.0.191,f.41270.0.192,f.41270.0.193,f.41270.0.194,f.41270.0.195,f.41270.0.196,f.41270.0.197,f.41270.0.198,f.41270.0.199,f.41270.0.200,f.41270.0.201,f.41270.0.202,f.41270.0.203,f.41270.0.204,f.41270.0.205,f.41270.0.206,f.41270.0.207,f.41270.0.208,f.41270.0.209,f.41270.0.210,f.41270.0.211,f.41270.0.212,f.41270.0.213,f.41270.0.214,f.41270.0.215,f.41270.0.216,f.41270.0.217,f.41270.0.218,f.41270.0.219,f.41270.0.220,f.41270.0.221,f.41270.0.222,f.41270.0.223,f.41270.0.224,f.41270.0.225
4004,A499,C504,C508,C509,C773,C786,C787,C795,D618,D619,D649,D65,D696,E274,F419,L918,M6208,N131,O366,O40,O48,O622,O631,O758,R17,R18,R509,R945,Z115,Z370,Z421,Z511,Z513,Z515,Z853,Z901,Z923,Z926,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
13912,J459,K429,K430,K439,K801,K824,L729,M6208,O800,R11,Z302,Z370,Z881,Z888,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
18509,A419,B349,B978,B99,E780,F101,F329,F522,G473,H010,H538,I10,I200,I209,I219,I251,I252,I258,I259,I635,I638,I652,I678,I959,J181,J189,J22,J439,J440,J449,J841,K20,K409,K43,K432,K469,K529,K573,K579,K628,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
29867,C341,C349,C671,C672,C674,C678,C679,C771,C787,D090,E669,E780,I10,I846,J448,J449,J459,K295,K449,M6208,N328,N952,R001,R042,R074,R509,R51,T812,Y836,Z080,Z082,Z087,Z089,Z466,Z511,Z530,Z551,Z824,Z851,Z855,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
46430,B07,B378,D619,D649,H001,I081,I480,J181,J189,J22,J459,J90,K219,K318,K43,K439,K449,K458,K559,K565,K590,K660,K769,M6208,N390,R194,R21,T812,Y834,Y95,Z864,Z904,Z980,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
51340,G439,L309,M6208,O021,O335,O342,O351,O360,O441,O469,O603,O721,T810,Y831,Z370,Z392,Z871,Z907,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
67936,B441,B449,B965,E271,E274,E669,F419,I10,J029,J22,J449,J459,J47,J82,J998,K219,K30,K409,K429,K590,K642,M6208,N62,R042,R060,R074,R14,U837,Z110,Z130,Z139,Z721,Z860,Z861,Z870,Z871,Z901,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
69773,D141,E780,H259,I10,I841,I849,K429,K625,M179,M2323,M6208,M6590,R300,R35,Z888,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
77677,E119,E149,E780,G560,G562,H728,I10,I209,K210,K219,K29,K296,K429,K449,K602,K628,M109,M232,M234,M2383,M4782,M6208,M754,R074,R104,R14,R798,Y838,Z532,Z721,Z800,Z864,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
91595,E119,E669,E780,E785,G562,G961,H024,H921,I10,I209,K429,M1399,M169,M471,M480,M4802,M4806,M503,M6208,N429,R074,Z130,Z512,Z864,Z874,Z888,Z922,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
