In [2]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
from lifelines import KaplanMeierFitter
from lifelines.utils import k_fold_cross_validation
from lifelines.statistics import logrank_test
import matplotlib.pyplot as plt


In [3]:
genes = ["ACTN2","ALPK3", "DES", "FLNC", "MYBPC3", "MYH6", "MYH7", "PLN", "PTPN11", "TNNI3", "TTR"]

#"ACTN2","ALPK3", "DES", "FLNC", "MYBPC3", "MYH6", "MYH7", "PLN", "PTPN11", "TNNI3", "TTR"


def create_model(gene, clintegrate = False, variant_data = False):
    
    
    # read in file
    file_name=gene+'.csv'
    phenotypic_data = pd.read_csv("/Users/uriel/Downloads/work_temp/lifelines_data/"+file_name, dtype={
            'is_family_hist':'object',
            'is_hcm':'object'
            })
    
    # drop unnecessary columns and rows; reformat is_hcm and is_family_hist columns to integer
    phenotypic_data = phenotypic_data.drop(['Unnamed: 0', 'birth_date', 'Name', 'death_age'], axis = 1)
    phenotypic_data=phenotypic_data.dropna()
    phenotypic_data["is_family_hist"] = np.where(phenotypic_data["is_family_hist"]=="True", 1, 0)
    phenotypic_data["is_hcm"] = np.where(phenotypic_data["is_hcm"]=="True", 1, 0)
    phenotypic_data = phenotypic_data.astype({"synonymous_variant":"boolean","missense_variant":"boolean", "deleterious":"boolean", "sex":"boolean", "is_family_hist":"boolean"})
    
    # hacks
    phenotypic_data = phenotypic_data[phenotypic_data['duration'] > 20]
    phenotypic_data.loc[((phenotypic_data["synonymous_variant"] == False) & (phenotypic_data["missense_variant"] == False) & (phenotypic_data["deleterious"] == False)), 'gnomADe_MAX_AF'] = 1

    

    if clintegrate == True:
        # keep only columns where p < 0.05, the column was a categorical variable related to functional consequence (synonymous_variant, missense_variant, deleterious), or the addition improved model concordance by at least 0.01 
        # (same for all tested genes)

        print("Clintegrate model")
        phenotypic_data = phenotypic_data.drop(['principal_component_2','principal_component_3', 'principal_component_5', 'principal_component_6', 'principal_component_7', 'principal_component_8', 'principal_component_9', 'principal_component_10','CADD_raw', 'phyloP100way_vertebrate', 'GERP++_RS', 'gnomADe_MAX_AF'], axis = 1)
    
    elif variant_data == True:
        # keep only columns related to variant information
        print("Variant model")
        phenotypic_data = phenotypic_data.drop(['sex', 'age', 'prs_score', 'is_family_hist', 'principal_component_1', 'principal_component_2','principal_component_3','principal_component_4', 'principal_component_5', 'principal_component_6', 'principal_component_7', 'principal_component_8', 'principal_component_9', 'principal_component_10', 'gnomADe_MAX_AF'], axis = 1)

  
    # create testing and training set: training set 80% of data


    #variants_df = phenotypic_data[(phenotypic_data["synonymous_variant"] == True) | (phenotypic_data["missense_variant"] == True) | (phenotypic_data["deleterious"] == True)]
    #no_variants_df = phenotypic_data[(phenotypic_data["synonymous_variant"] == False) & (phenotypic_data["missense_variant"] == False) & (phenotypic_data["deleterious"] == False)]

    cph = CoxPHFitter()
    k_fold_cross_validation(cph, phenotypic_data, duration_col='duration', event_col='is_hcm', k=5)







    # build model for training set
    #cph.fit(phenotypic_data, 'duration', 'is_hcm', fit_options = {"step_size":0.1})
    #phenotypic_data['hazard_score'] = cph.predict_partial_hazard(phenotypic_data)
    #train, test = train_test_split(phenotypic_data, test_size=0.2)

    


    bottom_25_variant = phenotypic_data[(phenotypic_data["missense_variant"] == True) | (phenotypic_data["deleterious"] == phenotypic_data)]['hazard_score'].quantile(0.25)
    top_25_variant = phenotypic_data[(phenotypic_data["missense_variant"] == True) | (phenotypic_data["deleterious"] == True)]['hazard_score'].quantile(0.75)

    bottom_25_no_variant = phenotypic_data[(phenotypic_data["missense_variant"] == False) & (phenotypic_data["deleterious"] == False)]['hazard_score'].quantile(0.25)
    top_25_no_variant = phenotypic_data[(phenotypic_data["missense_variant"] == False) & (phenotypic_data["deleterious"] == False)]['hazard_score'].quantile(0.75)



    # predict hazard score for testing set
    phenotypic_data['hazard_score'] = cph.predict_partial_hazard(phenotypic_data)
  
    # split testing set based on hazard score
    lowest_25_variant = phenotypic_data[((phenotypic_data["missense_variant"] == True) | (phenotypic_data["deleterious"] == True)) & (test['hazard_score'] <= bottom_25_variant)]
    highest_25_variant = phenotypic_data[((phenotypic_data["missense_variant"] == True) | (phenotypic_data["deleterious"] == True)) & (phenotypic_data['hazard_score'] >= top_25_variant)]

    lowest_25_no_variant = phenotypic_data[((phenotypic_data["missense_variant"] == False) & (phenotypic_data["deleterious"] == False)) & (phenotypic_data['hazard_score'] <= bottom_25_no_variant)]
    highest_25_no_variant = phenotypic_data[((phenotypic_data["missense_variant"] == False) & (phenotypic_data["deleterious"] == False)) & (phenotypic_data['hazard_score'] >= top_25_no_variant)]

    print(bottom_25_variant, top_25_variant)
    print(bottom_25_no_variant, top_25_no_variant)

    print(len(lowest_25_variant), len(highest_25_variant), len(lowest_25_no_variant), len(highest_25_no_variant))

    
    # fit Kaplan-Meier curves for plotting
    kmf_lowest_25_variant = KaplanMeierFitter()
    kmf_highest_25_variant = KaplanMeierFitter()
    kmf_lowest_25_no_variant = KaplanMeierFitter()
    kmf_highest_25_no_variant = KaplanMeierFitter()

    kmf_lowest_25_variant.fit(durations=lowest_25_variant['duration'], event_observed=lowest_25_variant['is_hcm'], label = 'lowest 25 variant')
    kmf_highest_25_variant.fit(durations=highest_25_variant['duration'], event_observed=highest_25_variant['is_hcm'], label = 'highest 25 variant')
    kmf_lowest_25_no_variant.fit(durations=lowest_25_no_variant['duration'], event_observed=lowest_25_no_variant['is_hcm'], label = 'lowest 25 no variant')
    kmf_highest_25_no_variant.fit(durations=highest_25_no_variant['duration'], event_observed=highest_25_no_variant['is_hcm'], label = 'highest 25 no variant')

    plt.figure()
    plt.title(gene)


    kmf_lowest_25_variant.plot_survival_function()
    kmf_highest_25_variant.plot_survival_function()
    kmf_lowest_25_no_variant.plot_survival_function()
    kmf_highest_25_no_variant.plot_survival_function()


    # log-rank comparison
    print("25th vs 75th percentile for variants")
    results = logrank_test(lowest_25_variant['duration'], highest_25_variant['duration'], event_observed_A=lowest_25_variant['is_hcm'], event_observed_B=highest_25_variant['is_hcm'])
    results.print_summary()

    print("25th vs 75th percentile for no variant")
    results = logrank_test(lowest_25_no_variant['duration'], highest_25_no_variant['duration'], event_observed_A=lowest_25_no_variant['is_hcm'], event_observed_B=highest_25_no_variant['is_hcm'])
    results.print_summary()


    print("25th percentile for variant vs. no variant")
    results = logrank_test(lowest_25_variant['duration'], lowest_25_no_variant['duration'], event_observed_A=lowest_25_variant['is_hcm'], event_observed_B=lowest_25_no_variant['is_hcm'])
    results.print_summary()

    print("75th percentile for variant vs. no variant")
    results = logrank_test(highest_25_variant['duration'], highest_25_no_variant['duration'], event_observed_A=highest_25_variant['is_hcm'], event_observed_B=highest_25_no_variant['is_hcm'])
    results.print_summary()




for gene in genes:
    print(gene)

    try:
        create_model(gene, variant_data=True)
    except ValueError as ve:
        print("insufficient data")
        

plt.show()



ACTN2
Variant model


  bottom_25_variant = phenotypic_data[(phenotypic_data["missense_variant"] == True) | (phenotypic_data["deleterious"] == phenotypic_data)]['hazard_score'].quantile(0.25)


: 