# Generating and parsing phenotypic data

##### Updated 06/03/2024
##### Selin Kubali

#### Goal
Load non-genetic data from UK Biobank and combine it into a single .csv file.

#### Output
cardiomyopathy.csv with variables *eid,duration,age,sex,is_family_hist,is_HCM,is_AF,is_HTN*. Upload to *selected_genes/hcm/csv_files/*. Further descriptions of the variables are available in the UK Biobank Showcase.
___

### Load packages

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt

## Import data from UK Biobank

### Import disease data 

In [2]:
!dx extract_dataset record-GVgvPX8JxvFB8BFz2z0Zx1P6 --fields "participant.eid,participant.p20002_i0,participant.p20002_i1,participant.p20002_i2,participant.p20002_i3,participant.p41270,participant.p41271,participant.p41272,participant.p40001_i0,participant.p40001_i1"
hypertrophic_df_diseases = pd.read_csv("app41250_20230525165327.dataset.csv") # replace with appropriate dataset name

hypertrophic_df_diseases.rename(columns = { 
    'participant.eid':'eid','participant.p20002_i0':'self_report_1', 'participant.p20002_i1':'self_report_2','participant.p20002_i2':'self_report_3','participant.p20002_i3':'self_report_4','participant.p41270':'icd_10','participant.p41271':'icd_9','participant.p41272': 'hospitalization_record','participant.p40001_i0':'death_record_1','participant.p40001_i1':'death_record_2'
}, inplace = True)

hypertrophic_df_diseases = hypertrophic_df_diseases.set_index('eid')

!rm "app41250_20230525165327.dataset.csv"

### Import date data

In [3]:
!dx extract_dataset record-GVgvPX8JxvFB8BFz2z0Zx1P6 --fields "participant.eid,participant.p52,participant.p34,participant.p40007_i0,participant.p53_i0,participant.p131338,participant.p40000_i0"

hypertrophic_df_dates = pd.read_csv("app41250_20230525165327.dataset.csv") # replace with appropriate dataset name

hypertrophic_df_dates.rename(columns = { 
    'participant.eid':'eid','participant.p52':'birth_month','participant.p34':'birth_year','participant.p40007_i0':'death_age','participant.p53_i0':'date_attend_assessment_center','participant.p131338':'first_reported_hcm','participant.p40000_i0':'death_date'
}, inplace = True)

hypertrophic_df_dates = hypertrophic_df_dates.set_index('eid')

!rm "app41250_20230525165327.dataset.csv"

### Import phenotypic data

In [22]:
!dx extract_dataset record-GVgvPX8JxvFB8BFz2z0Zx1P6 --fields "participant.eid,participant.p31,participant.p20107_i0,participant.p20107_i1,participant.p20107_i2,participant.p20107_i3,participant.p20110_i0,participant.p20110_i1,participant.p20110_i2,participant.p20110_i3,participant.p20111_i0,participant.p20111_i1,participant.p20111_i2,participant.p20111_i3"
hypertrophic_df_phenotypic = pd.read_csv("app41250_20230525165327.dataset.csv") # replace with appropriate dataset name

hypertrophic_df_phenotypic.rename(columns = {
    'participant.eid':'eid','participant.p31':'sex','participant.p20107_i0':'family_hist_1',
    'participant.p20107_i1':'family_hist_2', 'participant.p20107_i2':'family_hist_3', 'participant.p20107_i3':'family_hist_4',
    'participant.p20110_i0':'family_hist_5','participant.p20110_i1':'family_hist_6','participant.p20110_i2':'family_hist_7',
    'participant.p20110_i3':'family_hist_8', 'participant.p20111_i0':'family_hist_9','participant.p20111_i1':'family_hist_10',
    'participant.p20111_i2':'family_hist_11','participant.p20111_i3':'family_hist_12'}, inplace = True)

hypertrophic_df_phenotypic = hypertrophic_df_phenotypic.set_index('eid')

!rm "app41250_20230525165327.dataset.csv"

## Cleaning data

### Clean phenotypic data

##### Combine family history fields to find all people with first-degree relatives with heart disease

In [23]:
family_hist_cols = [f'family_hist_{n}' for n in range(1, 13)]

# Turn each raw cell into a list of code strings: fill missing cells with two
# spaces, drop the first and last character (presumably the enclosing brackets
# - confirm against the export format) and split what is left on commas.
for col in family_hist_cols:
    raw_values = hypertrophic_df_phenotypic[col].fillna("  ")
    hypertrophic_df_phenotypic[col] = raw_values.str[1:-1].str.split(",")

# detect if any family_hist column contains 1
contains_code_1 = {'1'}.issubset
parsed_family_hist = hypertrophic_df_phenotypic.filter(like="family_hist_")
code_1_per_column = parsed_family_hist.apply(lambda column: column.map(contains_code_1))
hypertrophic_df_phenotypic["is_family_hist"] = code_1_per_column.any(axis=1)

# The per-instance columns are no longer needed once the flag exists.
hypertrophic_df_phenotypic = hypertrophic_df_phenotypic.drop(family_hist_cols, axis=1)


### Clean disease data

##### Combine self-reported disease fields

In [6]:
self_report_cols = ['self_report_1', 'self_report_2', 'self_report_3', 'self_report_4']
death_cols = ['death_record_1', 'death_record_2']
single_source_cols = ["icd_10", "icd_9", "hospitalization_record"]

# Parse every raw code column the same way: fill missing cells with two spaces,
# drop the first and last character and split the remainder on commas, so each
# cell becomes a list of code strings.
for col in single_source_cols + self_report_cols + death_cols:
    raw_values = hypertrophic_df_diseases[col].fillna("  ")
    hypertrophic_df_diseases[col] = raw_values.str[1:-1].str.split(",")


def _concatenate_lists(row, cols):
    """Return one list holding the elements of row[col] for each col, in order."""
    merged = []
    for col in cols:
        merged = merged + row[col]
    return merged


# Merge the per-instance lists into a single list per participant.
hypertrophic_df_diseases['self_report'] = hypertrophic_df_diseases.apply(
    lambda row: _concatenate_lists(row, self_report_cols), axis=1
)
hypertrophic_df_diseases['death_records'] = hypertrophic_df_diseases.apply(
    lambda row: _concatenate_lists(row, death_cols), axis=1
)

# Only the merged columns are used from here on.
hypertrophic_df_diseases = hypertrophic_df_diseases.drop(self_report_cols + death_cols, axis=1)




#### Create codings for disease status

In [7]:

# Codes that count as evidence of each disease, grouped by disease and then by
# data source. ICD / hospitalization / death codes keep their surrounding double
# quotes; self-report codes are stored without quotes.
codes_by_disease = {
    'HCM': {
        'icd_10': ['"I421"', '"I422"'],
        'icd_9': ['"4251"'],
        'self_report': ['1588'],
        'hospitalization_record': [],
        'death_records': ['"I421"', '"I422"'],
    },
    'AF': {
        'icd_10': ['"I48"'],
        'icd_9': ['"4273"'],
        'self_report': ['1471', '1483'],  # no code found for cardioversion
        'hospitalization_record': ['"K571"', '"K572"', '"K574"', '"K575"', '"K576"', '"K577"', '"K621"', '"K622"', '"K623"'],
        'death_records': ['"I48"'],
    },
    'HTN': {
        'icd_10': ['"I10"', '"I11"', '"I12"', '"I13"', '"I15"'],
        'icd_9': ['"401"', '"402"', '"403"', '"404"', '"405"'],
        'self_report': ['1065', '1072'],
        'hospitalization_record': [],
        'death_records': ['"I10"', '"I11"', '"I12"', '"I13"', '"I15"'],
    },
}

source_order = ['icd_10', 'icd_9', 'self_report', 'hospitalization_record', 'death_records']

# Pivot into one column per source, with one row per disease.
disease_data = {'Diseases': list(codes_by_disease)}
for source in source_order:
    disease_data[source] = [codes_by_disease[disease][source] for disease in codes_by_disease]

disease_data_df = pd.DataFrame(data=disease_data).set_index("Diseases")

# Keep a copy of the coding table on disk.
disease_data_df.to_csv("disease_data.csv")



#### Calculate presence or absence of disease in each patient

In [8]:
# Flag each participant as having a disease when ANY evidence source (every
# column of disease_data_df) contains one of that disease's codes.
# The previous version OR-ed is_icd_10 twice and never used the ICD-9,
# hospitalization or death-record matches it had just computed.
# NOTE(review): matching is exact string membership, so a code only matches a
# list element that is spelled identically (including the quote characters).
# NOTE(review): the recorded run shows 0 death_records matches for all three
# diseases - confirm the death-record fields are parsed into comparable codes.
evidence_sources = list(disease_data_df.columns)

for disease, codes_by_source in disease_data_df.iterrows():
    has_disease = np.zeros(len(hypertrophic_df_diseases), dtype=bool)
    for source in evidence_sources:
        disease_codes = set(codes_by_source[source])
        if not disease_codes:
            # No codes defined for this disease in this source.
            continue
        matches = hypertrophic_df_diseases[source].apply(
            lambda lst, disease_codes=disease_codes: isinstance(lst, list) and not disease_codes.isdisjoint(lst)
        )
        has_disease |= matches.to_numpy(dtype=bool)
    hypertrophic_df_diseases["is_" + disease] = has_disease

# The raw code lists are not needed once the flags exist.
hypertrophic_df_diseases = hypertrophic_df_diseases.drop(evidence_sources, axis=1)

icd_10 687
icd_9 1
self_report 130
hospitalization_record 0
death_records 0
icd_10 21777
icd_9 65
self_report 4881
hospitalization_record 4891
death_records 0
icd_10 159860
icd_9 0
self_report 140208
hospitalization_record 0
death_records 0


## Date and time

#### Convert birth dates into datetime format

In [9]:
# convert birth_month and birth_year to string, remove decimal, and zero-pad
hypertrophic_df_dates["birth_year"] = hypertrophic_df_dates["birth_year"].apply(lambda s: (str(s))[0:-2])
hypertrophic_df_dates["birth_month"] = hypertrophic_df_dates["birth_month"].apply(lambda s: (str(s))[0:-2].zfill(2))
hypertrophic_df_dates = hypertrophic_df_dates.reset_index()

In [10]:
def create_datetime(row):
    """Convert a row's birth_month and birth_year strings to a datetime.

    Only month and year are available in the row, so the day is fixed at the 15th.

    Parameters
    ----------
    row : pandas.Series
        Must contain 'birth_month' (two-digit string '01'..'12') and
        'birth_year' (four-digit string).

    Returns
    -------
    datetime.datetime or pandas.NaT
        NaT when the month is not a valid two-digit month or the year cannot
        be parsed.
    """
    valid_months = {f'{month:02d}' for month in range(1, 13)}
    birth_month = row['birth_month']

    # The old else-branch called DataFrame.drop without using the result, which
    # changed nothing and returned None; return an explicit missing value instead.
    if not isinstance(birth_month, str) or birth_month not in valid_months:
        return pd.NaT

    try:
        return dt.strptime(birth_month + '-15-' + str(row['birth_year']), '%m-%d-%Y')
    except ValueError:
        # Valid month but unusable year: treat the birth date as missing.
        return pd.NaT

# Build one birth date per participant, then discard the month/year strings it was built from.
birth_dates = hypertrophic_df_dates.apply(create_datetime, axis=1)
hypertrophic_df_dates["birth_date"] = birth_dates
hypertrophic_df_dates = hypertrophic_df_dates.drop(columns=["birth_month", "birth_year"])

#### Convert dates to datetime format

In [11]:
def custom_strptime(row, col, min_year=1910):
    """Parse the 'YYYY-MM-DD' string in row[col] into a datetime.

    Parameters
    ----------
    row : pandas.Series
        One row of the dates frame.
    col : str
        Name of the date column to parse.
    min_year : int, default 1910
        Dates whose year is earlier than this are treated as missing.
        NOTE(review): presumably this screens out UK Biobank placeholder dates
        in the early 1900s - confirm against the field's data coding.

    Returns
    -------
    datetime.datetime or pandas.NaT
        NaT when the value is missing or its year is before min_year.
    """
    value = row[col]
    if pd.isnull(value):
        return pd.NaT

    # Accept already-parsed values so the cell can be re-run.
    parsed = value if isinstance(value, dt) else dt.strptime(value, '%Y-%m-%d')

    # The old check compared only the first THREE characters (e.g. 201) with
    # 1910, so it was always true and the NaT branch could never run.
    if parsed.year < min_year:
        return pd.NaT
    return parsed

# Parse each date column in place. The list previously lacked a comma after
# 'first_reported_hcm', which silently concatenated the first two names into a
# single non-existent column name.
date_columns = ['first_reported_hcm', 'date_attend_assessment_center', 'death_date']
for col in date_columns:
    hypertrophic_df_dates[col] = hypertrophic_df_dates.apply(custom_strptime, axis = 1, args = (col,))
    


### Calculate duration - all cases

In [12]:
def find_years_to_diagnosis(row):
    """Return the years from birth to the first reported diagnosis.

    Computed as (first_reported_hcm - birth_date) divided by 365.25 days.

    Parameters
    ----------
    row : pandas.Series
        Must contain 'first_reported_hcm' and 'birth_date' as datetimes.

    Returns
    -------
    float
        Years to diagnosis, or NaN when there is no first-reported date
        (NaN rather than NaT keeps the resulting column numeric).
    """
    first_reported = row['first_reported_hcm']
    if pd.isnull(first_reported):
        return np.nan
    # Use the values on the row itself rather than looking the row up again
    # in the module-level frame.
    return (first_reported - row['birth_date']) / pd.to_timedelta(365.25, unit='D')


# One value per participant; missing where no diagnosis date was reported.
years_to_diagnosis = hypertrophic_df_dates.apply(find_years_to_diagnosis, axis=1)
hypertrophic_df_dates['years_to_diagnosis'] = years_to_diagnosis


In [13]:
def find_duration(row, latest_date):
    """Return the duration, in years, for one participant.

    The first available value is used, in this order:
    1. 'years_to_diagnosis' (years from birth to first reported diagnosis),
    2. 'death_age',
    3. years from 'birth_date' to latest_date (365.25-day years).

    Parameters
    ----------
    row : pandas.Series
        Must contain 'years_to_diagnosis', 'death_age' and 'birth_date'.
    latest_date : datetime-like
        Reference date used when neither a diagnosis nor a death age exists.

    Returns
    -------
    float
        Duration in years.
    """
    # Read from the row itself rather than from the module-level frame.
    for column in ('years_to_diagnosis', 'death_age'):
        if pd.notnull(row[column]):
            return row[column]
    return (latest_date - row['birth_date']) / pd.to_timedelta(365.25, unit='D')
    

# Reference date: the most recent first-reported date among participants that have one.
has_first_report = hypertrophic_df_dates['first_reported_hcm'].notna()
latest_date = max(hypertrophic_df_dates.loc[has_first_report, 'first_reported_hcm'])

durations = hypertrophic_df_dates.apply(find_duration, axis=1, args=(latest_date,))
hypertrophic_df_dates['duration'] = durations


In [14]:
def find_age(row, latest_date):
    """Return the participant's age in years.

    Uses 'death_age' when it is present; otherwise the years from
    'birth_date' to latest_date (365.25-day years).

    Parameters
    ----------
    row : pandas.Series
        Must contain 'death_age' and 'birth_date'.
    latest_date : datetime-like
        Reference date used for participants without a death age.

    Returns
    -------
    float
        Age in years.
    """
    # Read from the row itself rather than from the module-level frame.
    death_age = row['death_age']
    if pd.notnull(death_age):
        return death_age
    return (latest_date - row['birth_date']) / pd.to_timedelta(365.25, unit='D')

# Reference date: the most recent first-reported date among participants that have one.
has_first_report = hypertrophic_df_dates['first_reported_hcm'].notna()
latest_date = max(hypertrophic_df_dates.loc[has_first_report, 'first_reported_hcm'])

ages = hypertrophic_df_dates.apply(find_age, axis=1, args=(latest_date,))
hypertrophic_df_dates['age'] = ages

### Clean, combine, and upload data

In [15]:
hypertrophic_df_dates = hypertrophic_df_dates.set_index(['eid'], drop=True)
hypertrophic_df_dates = hypertrophic_df_dates[hypertrophic_df_dates['duration'] > 10]
hypertrophic_df_dates = hypertrophic_df_dates.drop(['first_reported_hcm','death_age', 'date_attend_assessment_center', 'first_reported_hcm', 'death_date', 'birth_date', 'years_to_diagnosis'], axis = 1)
hypertrophic_df = hypertrophic_df_dates.join(hypertrophic_df_phenotypic).join(hypertrophic_df_diseases)
hypertrophic_df.to_csv("cardiomyopathy.csv")
!dx cd selected_genes/hcm/csv_files/
!dx upload cardiomyopathy.csv