# Explore epilepsy in UKBiobank data

# 1.Read in data

fields needed:
```
131048	Date G40 first reported (epilepsy)
41271	ICD9 codes
41270	Diagnoses - ICD10
20002	Non-cancer illness code, self-reported
20003	Treatment/medication code
```

In [2]:
import pandas as pd
import numpy as np
# collect the necessary column names of the database for our analysis

with open("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/092821_UKBB_486416ind_call90.csv") as fp:
    line = fp.readline() # header
    header = line.split(",")
    
    indiv = ["IID", "FID"]
    icd10_colnames = [col.strip('"') for col in header if "f.41270." in col]
    icd10_ages = [col.strip('"') for col in header if "f.41280." in col]
    icd9_colnames = [col.strip('"') for col in header if "f.41271." in col]
    icd9_ages = [col.strip('"') for col in header if "f.41281." in col]
    f20002_colnames = [col.strip('"') for col in header if "f.20002." in col]
    reported_sex = ["f.31.0.0"]
    genetic_sex = ["f.22001.0.0"]
    ethnicity = [col.strip('"') for col in header if "f.21000." in col]
    year_of_birth = [col.strip('"') for col in header if "f.34." in col]
    month_of_birth = [col.strip('"') for col in header if "f.52." in col]
    
    date_g40_col = [col.strip('"') for col in header if "f.131048." in col]
    medication = [col.strip('"') for col in header if "f.20003." in col]
# combine the column names into one list
combined_cols = indiv + icd10_colnames + icd10_ages + icd9_colnames + icd9_ages + f20002_colnames \
                + ethnicity + reported_sex + genetic_sex + year_of_birth + month_of_birth \
                + date_g40_col + medication
# database of all individuals that we are working with and the selected phenotypes
df = pd.read_csv("/mnt/mfs/statgen/UKBiobank/phenotype_files/HI_UKBB/092821_UKBB_486416ind_call90.csv", quotechar = '"', dtype="string", usecols=combined_cols)
df

Unnamed: 0,IID,FID,f.31.0.0,f.34.0.0,f.52.0.0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,...,f.41281.0.38,f.41281.0.39,f.41281.0.40,f.41281.0.41,f.41281.0.42,f.41281.0.43,f.41281.0.44,f.41281.0.45,f.41281.0.46,f.131048.0.0
0,1000019,1000019,Female,1960,November,1111,,,,,...,,,,,,,,,,
1,1000022,1000022,Male,1954,August,1065,,,,,...,,,,,,,,,,
2,1000035,1000035,Male,1944,May,1396,1473,,,,...,,,,,,,,,,
3,1000046,1000046,Female,1946,March,1065,1294,1476,1473,1374,...,,,,,,,,,,
4,1000054,1000054,Female,1942,January,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486411,6025390,6025390,Female,1942,March,1464,,,,,...,,,,,,,,,,
486412,6025409,6025409,Female,1946,November,1478,1473,,,,...,,,,,,,,,,
486413,6025411,6025411,Female,1960,November,,,,,,...,,,,,,,,,,
486414,6025425,6025425,Female,1963,August,1265,,,,,...,,,,,,,,,,


# 2. Sample QC
## 2.1. Remove individuals that do not match for reported and genetic sex

In [3]:
reported_sex = ["f.31.0.0"]
genetic_sex = ["f.22001.0.0"]
# returns true only if 
def inconsistent_sexes(row):
    return pd.isna(row[genetic_sex[0]]) or row[reported_sex[0]] != row[genetic_sex[0]]
# exclusion based on inconsistent sex
ex_sex = df[reported_sex + genetic_sex].apply(inconsistent_sexes, axis=1)
filtered = df[~ex_sex]
filtered.shape

(486416, 884)

In [4]:
print(sum(ex_sex), "individuals removed because of inconsistency with the genetic and reported sex variables")

0 individuals removed because of inconsistency with the genetic and reported sex variables


In [5]:
print("Of these individuals", sum([1 for x in df[genetic_sex[0]].to_list() if pd.isna(x)]), "were NA for the genetic sex variable")

Of these individuals 0 were NA for the genetic sex variable


## 2.2. Remove non-white individuals

In [6]:
# set of answers for the ethnicity question
set(filtered[ethnicity[0]].to_list()).union( set(filtered[ethnicity[1]].to_list()) , set(filtered[ethnicity[2]].to_list()))

{<NA>,
 'African',
 'Any other Asian background',
 'Any other Black background',
 'Any other mixed background',
 'Any other white background',
 'Asian or Asian British',
 'Bangladeshi',
 'Black or Black British',
 'British',
 'Caribbean',
 'Chinese',
 'Do not know',
 'Indian',
 'Irish',
 'Mixed',
 'Other ethnic group',
 'Pakistani',
 'Prefer not to answer',
 'White',
 'White and Asian',
 'White and Black African',
 'White and Black Caribbean'}

In [7]:
# these should align with all possible options for ethnicity answers except for <NA>, Do not know, and Prefer not to answer
white = ['British', 'Irish', 'White','Any other white background']
african = ['Caribbean','White and Black Caribbean', 'African', 'White and Black African', 'Black or Black British', 'Any other Black background' ]
asian = ['Indian', 'Pakistani', 'White and Asian', 'Any other Asian background', 'Bangladeshi', 'Asian or Asian British']
mixed = ['Mixed', 'Any other mixed background']
chinese = ['Chinese']
other = ['Other ethnic group']

# figure out the ancestry of each individual
def ancestry(row):
    temp = [x for x in row[ethnicity] if not pd.isna(x) and x != "Prefer not to answer" and x != "Do not know"]
    if len(temp) == 0:
        return "Unknown"
    
    if len(set(temp)) == 1 and temp[0] in white: # if we have only one unique answer and the answer is in the white variable
        return "_".join(temp[0].split(" ")) # return the unique answer
    
    if len([x for x in temp if x in white]) == len(temp):
        return "Inconsistent_white"
    if len([x for x in temp if x in african]) == len(temp):
        return "African"
    if len([x for x in temp if x in asian]) == len(temp):
        return "Asian"
    if len([x for x in temp if x in mixed]) == len(temp):
        return "Mixed"
    if len([x for x in temp if x in chinese]) == len(temp):
        return "Chinese"
    if len([x for x in temp if x in other]) == len(temp):
        return "Other"
    return "Inconsistent"
filtered["ethnicity"] = filtered[ethnicity].apply(ancestry, axis=1)
def find_non_white(row):
    return row["ethnicity"] not in white and row["ethnicity"] != "Unknown" and row["ethnicity"] != "Inconsistent_white" and row["ethnicity"] != "Any_other_white_background"
ex_non_white = filtered[["ethnicity"]].apply(find_non_white, axis=1)
filtered = filtered[~ex_non_white]
filtered.shape

(460649, 885)

In [8]:
print(sum(ex_non_white), "individuals removed for being non-white")

25767 individuals removed for being non-white in the exome data


## 3. Identify Sex Column

In [9]:
# male is denoted a 0, female as 1
def find_sex(row):
    if row["f.31.0.0"] == "Male":
        return 0
    return 1

filtered["sex"] = filtered[["f.31.0.0"]].apply(find_sex, axis=1)
filtered["sex"].value_counts(dropna=False)

1    249900
0    210749
Name: sex, dtype: int64

# 4. Explore epilepsy
## 4.1 ICD9 codes

In [10]:
icd9_list = ["345","3450","3451","3452","3453","3454","3455","3457","3459"]
# returns if the current individual should be excluded based on the exclusion list
def contains_inclusion(row, inclusion_list):
    for i in row:
        if not pd.isna(i) and i in inclusion_list:
            return True
    return False
# collect the individuals that should be excluded because of icd9
in_fxn_icd9 = lambda row: contains_inclusion(row, icd9_list)
ep_icd9 = filtered[icd9_colnames].apply(in_fxn_icd9, axis=1)
filtered.loc[ep_icd9,icd9_colnames]

Unnamed: 0,f.41271.0.0,f.41271.0.1,f.41271.0.2,f.41271.0.3,f.41271.0.4,f.41271.0.5,f.41271.0.6,f.41271.0.7,f.41271.0.8,f.41271.0.9,...,f.41271.0.37,f.41271.0.38,f.41271.0.39,f.41271.0.40,f.41271.0.41,f.41271.0.42,f.41271.0.43,f.41271.0.44,f.41271.0.45,f.41271.0.46
10147,3459,5206,7295,,,,,,,,...,,,,,,,,,,
14149,3459,7832,,,,,,,,,...,,,,,,,,,,
17814,3459,6250,6268,,,,,,,,...,,,,,,,,,,
18835,3039,3050,3459,53019,5609,5680,5720,5770,5771,7832,...,,,,,,,,,,
22766,3459,7245,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450285,3459,4549,4709,V642,,,,,,,...,,,,,,,,,,
460854,3455,7803,7805,V718,,,,,,,...,,,,,,,,,,
460881,3459,78039,8510,8520,8540,E88890,E9689,V155,V584,,...,,,,,,,,,,
475372,2381,2809,28090,28099,31999,3451,34510,34519,3453,3459,...,,,,,,,,,,


In [11]:
def count_per_indiv(d, row):
    temp = set()
    for each in row:
        temp.add(each)
    for each in temp:
        if each in d.keys():
            d[each] += 1
        else:
            d[each] = 1

icd9_count_codes = dict()
icd9_fxn = lambda row: count_per_indiv(icd9_count_codes, row)
temp = filtered.loc[ep_icd9,icd9_colnames].apply(icd9_fxn, axis=1)
icd9_counts=pd.DataFrame([icd9_count_codes])
icd9_counts

Unnamed: 0,5206,7295,3459,<NA>,7832,6268,6250,E8786,9982,53019,...,V251,E9600,31999,5219,62602,V146,E9201,E91933,4781,9594
0,1,1,70,83,3,3,2,3,1,1,...,1,1,1,1,1,1,1,1,1,1


In [12]:
codelist=[]
for i in icd9_counts.columns:
    i=str(i)
    if not pd.isna(i) and i in icd9_list:
        codelist.append(i)
codelist

['3459', '3450', '3451', '3454', '3457', '3453', '3455', '3452']

In [15]:
icd9_counts[['3459', '3450', '3451', '3454', '3457', '3453', '3455', '3452']]

Unnamed: 0,3459,3450,3451,3454,3457,3453,3455,3452
0,70,1,16,4,1,4,2,1


## 4.2 ICD10 codes

### 4.2.1 G40 Epilepsy

In [16]:
def contains_icd10(row):
    for i in row:
        if not pd.isna(i) and "G40" in i:
            return True
    return False

ep_icd10 = filtered[icd10_colnames].apply(contains_icd10, axis=1)
filtered.loc[ep_icd10,icd10_colnames]

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,...,f.41270.0.216,f.41270.0.217,f.41270.0.218,f.41270.0.219,f.41270.0.220,f.41270.0.221,f.41270.0.222,f.41270.0.223,f.41270.0.224,f.41270.0.225
91,A099,B957,C711,C719,D332,D430,D432,E780,E835,F171,...,,,,,,,,,,
241,G409,H830,I690,I846,I849,J459,K210,K219,K296,K573,...,,,,,,,,,,
312,A083,A099,B972,F329,F419,G402,J128,K219,K30,K589,...,,,,,,,,,,
490,B968,D122,D125,D231,E119,E780,G409,H269,H353,I10,...,,,,,,,,,,
491,E780,G409,H549,H811,M169,S099,W189,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486225,B972,E119,E512,F03,F100,F101,F102,F103,F107,F329,...,,,,,,,,,,
486226,G406,S8221,S8260,V235,Z478,,,,,,...,,,,,,,,,,
486319,A099,C509,C795,E669,F329,F419,G409,H023,H024,H919,...,,,,,,,,,,
486324,C187,C787,D125,E119,E780,F329,G409,I10,J9691,K589,...,,,,,,,,,,


In [18]:
def count_per_indiv(d, row):
    temp = set()
    for each in row:
        temp.add(each)
    for each in temp:
        if each in d.keys():
            d[each] += 1
        else:
            d[each] = 1

idc10_count_codes = dict()
idc10_fxn = lambda row: count_per_indiv(idc10_count_codes, row)
temp = filtered.loc[ep_icd10,icd10_colnames].apply(idc10_fxn, axis=1)
idc10_counts=pd.DataFrame([idc10_count_codes])
idc10_counts

Unnamed: 0,Z538,C719,T857,R900,B957,L989,G403,Z858,G409,F171,...,M6223,L605,S601,M4794,W029,M0299,S837,D010,C161,S341
0,591,127,33,67,52,183,753,437,5518,725,...,1,1,1,1,1,1,1,1,1,1


In [20]:
codelist=[]
for i in idc10_counts.columns:
    i=str(i)
    if not pd.isna(i) and "G40" in i:
        codelist.append(i)
codelist

['G403',
 'G409',
 'G402',
 'G406',
 'G401',
 'G408',
 'G407',
 'G405',
 'G400',
 'G404',
 'G40']

In [22]:
idc10_counts[codelist]

Unnamed: 0,G403,G409,G402,G406,G401,G408,G407,G405,G400,G404,G40
0,753,5518,468,316,294,160,60,78,54,20,1


### 4.2.2 G41 Status epilepticus

In [23]:
def contains_icd10(row):
    for i in row:
        if not pd.isna(i) and "G41" in i:
            return True
    return False

ep_icd10 = filtered[icd10_colnames].apply(contains_icd10, axis=1)
filtered.loc[ep_icd10,icd10_colnames]

Unnamed: 0,f.41270.0.0,f.41270.0.1,f.41270.0.2,f.41270.0.3,f.41270.0.4,f.41270.0.5,f.41270.0.6,f.41270.0.7,f.41270.0.8,f.41270.0.9,...,f.41270.0.216,f.41270.0.217,f.41270.0.218,f.41270.0.219,f.41270.0.220,f.41270.0.221,f.41270.0.222,f.41270.0.223,f.41270.0.224,f.41270.0.225
2117,D259,D509,G35,G379,G409,G410,G561,H669,K449,M796,...,,,,,,,,,,
3315,F101,F103,F329,G419,I493,K210,K228,K297,K449,M542,...,,,,,,,,,,
7349,A047,A084,A099,A419,A490,B954,B956,B962,C442,C446,...,,,,,,,,,,
8351,A415,B964,D125,D126,E039,E669,E780,G410,G418,G919,...,,,,,,,,,,
9525,E780,F329,G403,G409,G419,G819,G838,I269,I620,I639,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479418,A084,A099,B349,B373,B968,C445,D123,D250,D259,D374,...,,,,,,,,,,
482989,C444,F329,F419,G409,G419,G473,G562,H269,H919,I10,...,,,,,,,,,,
483837,C710,F329,G419,G936,G941,I10,R104,R940,S5200,W189,...,,,,,,,,,,
485089,E039,F329,G401,G419,G560,G911,G919,I613,I802,J22,...,,,,,,,,,,


In [24]:
def count_per_indiv(d, row):
    temp = set()
    for each in row:
        temp.add(each)
    for each in temp:
        if each in d.keys():
            d[each] += 1
        else:
            d[each] = 1

icd10_count_codes = dict()
icd10_fxn = lambda row: count_per_indiv(icd10_count_codes, row)
temp = filtered.loc[ep_icd10,icd10_colnames].apply(icd10_fxn, axis=1)
icd10_counts=pd.DataFrame([icd10_count_codes])
icd10_counts

Unnamed: 0,G561,H669,M796,G409,K449,R42,N920,R55,Z512,Z867,...,I613,Q030,Z301,G911,M9075,M051,I741,J990,M0510,M0580
0,1,6,4,167,47,19,7,51,21,135,...,1,1,1,1,1,1,1,1,1,1


In [25]:
codelist=[]
for i in idc10_counts.columns:
    i=str(i)
    if not pd.isna(i) and "G41" in i:
        codelist.append(i)
codelist

['G410', 'G419', 'G418', 'G412', 'G411']

In [26]:
idc10_counts[codelist]

Unnamed: 0,G410,G419,G418,G412,G411
0,40,131,7,10,5


## 4.3 Non-cancer illness code

According to https://biobank.ndph.ox.ac.uk/showcase/coding.cgi?id=6, epilepsys is 1264

In [28]:
def contains_20002(row):
    for i in row:
        if not pd.isna(i) and "1264" in i:
            return True
    return False

ep_f20002 = filtered[f20002_colnames].apply(contains_20002, axis=1)
filtered.loc[ep_f20002,f20002_colnames]

Unnamed: 0,f.20002.0.0,f.20002.0.1,f.20002.0.2,f.20002.0.3,f.20002.0.4,f.20002.0.5,f.20002.0.6,f.20002.0.7,f.20002.0.8,f.20002.0.9,...,f.20002.3.24,f.20002.3.25,f.20002.3.26,f.20002.3.27,f.20002.3.28,f.20002.3.29,f.20002.3.30,f.20002.3.31,f.20002.3.32,f.20002.3.33
91,1264,99999,,,,,,,,,...,,,,,,,,,,
241,1111,1086,1264,1286,,,,,,,...,,,,,,,,,,
256,1264,1196,,,,,,,,,...,,,,,,,,,,
501,1264,,,,,,,,,,...,,,,,,,,,,
526,1264,1226,1387,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485885,1264,1138,,,,,,,,,...,,,,,,,,,,
485909,1264,,,,,,,,,,...,,,,,,,,,,
486226,1264,1469,,,,,,,,,...,,,,,,,,,,
486319,1065,1264,1086,,,,,,,,...,,,,,,,,,,
