# Cox regression model
#### Selin Kubali
#### Updated 10-11-2023

Goal: Model all genes related to hypertrophic cardiomyopathy with a Cox regression model in three possible configurations: with all possible parameters, with parameters based on the procedure given in [Fife et al. 2021](https://www.medrxiv.org/content/10.1101/2021.08.12.21261563v1), and with only statistically significant parameters. 

Input: one .csv file per gene, found in /selected_genes/hcm/lifelines_data in the Cassa Lab project on UKBiobank. Instructions for how to generate these files are found in the notebooks in /selected_genes/hcm/notebooks.


In [2]:
import pandas as pd
from lifelines import CoxPHFitter
import numpy as np
from lifelines import KaplanMeierFitter
import os
# The per-gene CSV files are opened by bare file name (see create_model), so
# the working directory must be the folder that contains them.
# NOTE(review): hard-coded, machine-specific path — adjust before running elsewhere.
os.chdir('/Users/uriel/Downloads/work_temp/gnomadAI_lifelines')

In [5]:
# Gene symbols to model; create_model reads "<symbol>.csv" for each entry.
genes = ["ACTN2","ALPK3", "DES", "FLNC", "MYBPC3", "MYH6", "MYH7", "PTPN11", "TNNI3", "TTR"]
def create_model(gene, min = False, clintegrate = False):
    """Fit a Cox proportional-hazards model for one gene and print its summary.

    Reads ``<gene>.csv`` from the current working directory, removes the
    ``Name`` and ``index`` columns and every row that contains a missing
    value, sets ``am_pathogenicity`` to 1 wherever ``deleterious`` equals 1,
    optionally removes a fixed set of covariate columns, and fits a
    ``CoxPHFitter`` with ``duration`` as the duration column and ``is_hcm``
    as the event column.

    Args:
        gene: Gene symbol; the input file is ``gene + '.csv'``.
        min: If True, fit the "minimum" configuration: the Clintegrate drop
            list plus the functional-consequence and splice-score columns
            are removed. (The name shadows the builtin ``min``; it is kept
            so existing keyword calls continue to work.)
        clintegrate: If True, fit the "Clintegrate" configuration: a fixed
            list of principal components and conservation scores is removed.
            When both flags are True the minimum configuration's (superset)
            drop list is what ends up applied.

    Returns:
        The fitted ``CoxPHFitter`` instance. The summary is still printed as
        before.

    Raises:
        KeyError: If a column expected by the chosen configuration (or
            ``Name``/``index``) is absent from the CSV.
    """
    # Columns removed by the Clintegrate configuration (same for all tested genes).
    clintegrate_drop = [
        'principal_component_2', 'principal_component_3',
        'principal_component_5', 'principal_component_6',
        'principal_component_7', 'principal_component_8',
        'principal_component_9', 'principal_component_10',
        'CADD_raw', 'phyloP100way_vertebrate', 'GERP++_RS',
    ]
    # Additional columns removed by the minimum configuration, which keeps
    # only the covariates found significant at p < 0.05.
    minimum_extra_drop = [
        'trv', 'ds_ag', 'ds_al', 'ds_dg', 'ds_dl',
        'missense_variant', 'synonymous_variant', 'deleterious',
    ]

    # Read the per-gene table; the two flag columns use pandas' nullable
    # boolean dtype.
    file_name = gene + '.csv'
    phenotypic_data = pd.read_csv(file_name, dtype={
            'is_family_hist':'boolean',
            'is_hcm':'boolean'
            })

    # Remove the identifier columns, then every row with any missing value.
    # The row filter runs before covariate selection, so a missing value in a
    # column that is dropped later still excludes the row.
    phenotypic_data = phenotypic_data.drop(columns=['Name', 'index'])
    phenotypic_data = phenotypic_data.dropna()

    # Variants flagged deleterious get the maximum pathogenicity score.
    phenotypic_data.loc[phenotypic_data['deleterious'] == 1, 'am_pathogenicity'] = 1

    # Build one drop list so that enabling both flags no longer attempts to
    # remove the same columns twice (which raised KeyError).
    columns_to_drop = []
    if clintegrate:
        # Keeps columns where p < 0.05, the functional-consequence
        # categoricals (synonymous_variant, missense_variant, deleterious),
        # or columns whose addition improved concordance by at least 0.01.
        print("Clintegrate model")
        columns_to_drop.extend(clintegrate_drop)
    if min:
        print("Minimum model")
        columns_to_drop.extend(
            column
            for column in clintegrate_drop + minimum_extra_drop
            if column not in columns_to_drop
        )
    if columns_to_drop:
        phenotypic_data = phenotypic_data.drop(columns=columns_to_drop)

    # Fit the Cox model on all remaining columns; step_size is forwarded to
    # the lifelines optimizer through fit_options.
    cph = CoxPHFitter()
    cph.fit(
        phenotypic_data,
        duration_col='duration',
        event_col='is_hcm',
        fit_options={"step_size": 0.1},
    )
    cph.print_summary()
    return cph





# Fit one model per gene, printing the gene symbol before each summary.
# Exactly one create_model call is active; change which line is commented
# out to select the configuration.
for gene in genes:
    print(gene)
    # Full model (no covariate columns removed):
    #create_model(gene)
    # Clintegrate model:
    #create_model(gene, clintegrate=True)
    # Minimum model:
    create_model(gene, min = True)



ACTN2
Minimum model


0,1
model,lifelines.CoxPHFitter
duration col,'duration'
event col,'is_hcm'
baseline estimation,breslow
number of observations,483663
number of events observed,723
partial log-likelihood,-8870.35
time fit was run,2023-12-06 19:46:03 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
sex,0.78,2.17,0.08,0.62,0.93,1.87,2.53,0.0,10.02,<0.005,76.05
is_family_hist,0.59,1.8,0.08,0.44,0.74,1.55,2.09,0.0,7.7,<0.005,46.06
age,-0.06,0.94,0.01,-0.08,-0.05,0.93,0.95,0.0,-9.89,<0.005,74.23
principal_component_1,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,7.66,<0.005,45.58
principal_component_4,-0.01,0.99,0.0,-0.01,-0.0,0.99,1.0,0.0,-3.31,<0.005,10.06
prs_score,34.91,1450000000000000.0,2.62,29.78,40.03,8590000000000.0,2.43e+17,0.0,13.35,<0.005,132.59
am_pathogenicity,0.13,1.14,0.59,-1.03,1.29,0.36,3.63,0.0,0.21,0.83,0.27

0,1
Concordance,0.72
Partial AIC,17754.71
log-likelihood ratio test,479.63 on 7 df
-log2(p) of ll-ratio test,327.93


ALPK3
Minimum model


0,1
model,lifelines.CoxPHFitter
duration col,'duration'
event col,'is_hcm'
baseline estimation,breslow
number of observations,483663
number of events observed,723
partial log-likelihood,-8858.84
time fit was run,2023-12-06 19:46:06 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
sex,0.78,2.18,0.08,0.63,0.93,1.87,2.53,0.0,10.02,<0.005,76.16
is_family_hist,0.59,1.8,0.08,0.44,0.74,1.55,2.09,0.0,7.71,<0.005,46.13
age,-0.06,0.94,0.01,-0.08,-0.05,0.93,0.95,0.0,-9.9,<0.005,74.28
principal_component_1,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,7.69,<0.005,45.91
principal_component_4,-0.01,0.99,0.0,-0.01,-0.0,0.99,1.0,0.0,-3.33,<0.005,10.18
prs_score,34.97,1540000000000000.0,2.62,29.84,40.1,9140000000000.0,2.59e+17,0.0,13.37,<0.005,133.01
am_pathogenicity,1.81,6.09,0.3,1.21,2.4,3.36,11.06,0.0,5.95,<0.005,28.45

0,1
Concordance,0.73
Partial AIC,17731.68
log-likelihood ratio test,502.65 on 7 df
-log2(p) of ll-ratio test,344.37


DES
Minimum model


0,1
model,lifelines.CoxPHFitter
duration col,'duration'
event col,'is_hcm'
baseline estimation,breslow
number of observations,483663
number of events observed,723
partial log-likelihood,-8869.36
time fit was run,2023-12-06 19:46:09 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
sex,0.78,2.18,0.08,0.63,0.93,1.87,2.53,0.0,10.02,<0.005,76.12
is_family_hist,0.59,1.8,0.08,0.44,0.74,1.55,2.09,0.0,7.7,<0.005,46.08
age,-0.06,0.94,0.01,-0.08,-0.05,0.93,0.95,0.0,-9.89,<0.005,74.23
principal_component_1,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,7.69,<0.005,45.96
principal_component_4,-0.01,0.99,0.0,-0.01,-0.0,0.99,1.0,0.0,-3.33,<0.005,10.19
prs_score,34.91,1450000000000000.0,2.62,29.78,40.03,8590000000000.0,2.43e+17,0.0,13.35,<0.005,132.6
am_pathogenicity,-2.15,0.12,2.04,-6.16,1.85,0.0,6.39,0.0,-1.05,0.29,1.77

0,1
Concordance,0.72
Partial AIC,17752.71
log-likelihood ratio test,481.62 on 7 df
-log2(p) of ll-ratio test,329.36


FLNC
Minimum model


0,1
model,lifelines.CoxPHFitter
duration col,'duration'
event col,'is_hcm'
baseline estimation,breslow
number of observations,483663
number of events observed,723
partial log-likelihood,-8868.35
time fit was run,2023-12-06 19:46:12 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
sex,0.78,2.17,0.08,0.62,0.93,1.87,2.53,0.0,10.01,<0.005,75.98
is_family_hist,0.59,1.8,0.08,0.44,0.74,1.55,2.09,0.0,7.7,<0.005,46.08
age,-0.06,0.94,0.01,-0.08,-0.05,0.93,0.95,0.0,-9.89,<0.005,74.24
principal_component_1,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,7.65,<0.005,45.45
principal_component_4,-0.01,0.99,0.0,-0.01,-0.0,0.99,1.0,0.0,-3.28,<0.005,9.93
prs_score,34.91,1450000000000000.0,2.62,29.78,40.03,8600000000000.0,2.43e+17,0.0,13.35,<0.005,132.6
am_pathogenicity,0.93,2.54,0.41,0.13,1.74,1.14,5.7,0.0,2.27,0.02,5.43

0,1
Concordance,0.72
Partial AIC,17750.69
log-likelihood ratio test,483.64 on 7 df
-log2(p) of ll-ratio test,330.80


MYBPC3
Minimum model


0,1
model,lifelines.CoxPHFitter
duration col,'duration'
event col,'is_hcm'
baseline estimation,breslow
number of observations,483663
number of events observed,723
partial log-likelihood,-8791.42
time fit was run,2023-12-06 19:46:15 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
sex,0.78,2.19,0.08,0.63,0.94,1.88,2.55,0.0,10.1,<0.005,77.31
is_family_hist,0.59,1.8,0.08,0.44,0.74,1.55,2.1,0.0,7.72,<0.005,46.26
age,-0.06,0.94,0.01,-0.08,-0.05,0.93,0.95,0.0,-9.84,<0.005,73.5
principal_component_1,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,7.61,<0.005,45.07
principal_component_4,-0.01,0.99,0.0,-0.01,-0.0,0.99,1.0,0.0,-3.08,<0.005,8.93
prs_score,35.08,1720000000000000.0,2.62,29.95,40.2,10200000000000.0,2.89e+17,0.0,13.41,<0.005,133.88
am_pathogenicity,3.48,32.51,0.2,3.1,3.87,22.09,47.86,0.0,17.65,<0.005,229.25

0,1
Concordance,0.74
Partial AIC,17596.83
log-likelihood ratio test,637.50 on 7 df
-log2(p) of ll-ratio test,440.79


MYH6
Minimum model


0,1
model,lifelines.CoxPHFitter
duration col,'duration'
event col,'is_hcm'
baseline estimation,breslow
number of observations,483663
number of events observed,723
partial log-likelihood,-8869.35
time fit was run,2023-12-06 19:46:18 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
sex,0.78,2.17,0.08,0.62,0.93,1.87,2.53,0.0,10.01,<0.005,75.91
is_family_hist,0.59,1.8,0.08,0.44,0.74,1.55,2.09,0.0,7.7,<0.005,46.04
age,-0.06,0.94,0.01,-0.08,-0.05,0.93,0.95,0.0,-9.89,<0.005,74.26
principal_component_1,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,7.68,<0.005,45.82
principal_component_4,-0.01,0.99,0.0,-0.01,-0.0,0.99,1.0,0.0,-3.33,<0.005,10.15
prs_score,34.89,1430000000000000.0,2.62,29.77,40.02,8480000000000.0,2.4e+17,0.0,13.34,<0.005,132.5
am_pathogenicity,-0.71,0.49,0.54,-1.78,0.36,0.17,1.43,0.0,-1.3,0.19,2.36

0,1
Concordance,0.72
Partial AIC,17752.70
log-likelihood ratio test,481.63 on 7 df
-log2(p) of ll-ratio test,329.36


MYH7
Minimum model


0,1
model,lifelines.CoxPHFitter
duration col,'duration'
event col,'is_hcm'
baseline estimation,breslow
number of observations,483663
number of events observed,723
partial log-likelihood,-8837.99
time fit was run,2023-12-06 19:46:21 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
sex,0.78,2.17,0.08,0.62,0.93,1.87,2.53,0.0,10.0,<0.005,75.87
is_family_hist,0.59,1.8,0.08,0.44,0.74,1.55,2.09,0.0,7.66,<0.005,45.6
age,-0.06,0.94,0.01,-0.08,-0.05,0.93,0.95,0.0,-9.86,<0.005,73.72
principal_component_1,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,7.73,<0.005,46.43
principal_component_4,-0.01,0.99,0.0,-0.01,-0.0,0.99,1.0,0.0,-3.35,<0.005,10.28
prs_score,34.83,1340000000000000.0,2.61,29.71,39.96,7990000000000.0,2.26e+17,0.0,13.32,<0.005,132.13
am_pathogenicity,2.2,9.04,0.21,1.79,2.61,6.0,13.64,0.0,10.5,<0.005,83.28

0,1
Concordance,0.74
Partial AIC,17689.97
log-likelihood ratio test,544.36 on 7 df
-log2(p) of ll-ratio test,374.17


PTPN11
Minimum model


0,1
model,lifelines.CoxPHFitter
duration col,'duration'
event col,'is_hcm'
baseline estimation,breslow
number of observations,483663
number of events observed,723
partial log-likelihood,-8868.50
time fit was run,2023-12-06 19:46:24 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
sex,0.78,2.17,0.08,0.62,0.93,1.87,2.53,0.0,10.01,<0.005,76.01
is_family_hist,0.59,1.8,0.08,0.44,0.74,1.55,2.09,0.0,7.7,<0.005,46.05
age,-0.06,0.94,0.01,-0.08,-0.05,0.93,0.95,0.0,-9.89,<0.005,74.17
principal_component_1,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,7.63,<0.005,45.29
principal_component_4,-0.01,0.99,0.0,-0.01,-0.0,0.99,1.0,0.0,-3.32,<0.005,10.11
prs_score,34.92,1460000000000000.0,2.62,29.79,40.05,8690000000000.0,2.47e+17,0.0,13.35,<0.005,132.63
am_pathogenicity,1.45,4.26,0.62,0.24,2.66,1.27,14.28,0.0,2.35,0.02,5.72

0,1
Concordance,0.72
Partial AIC,17751.00
log-likelihood ratio test,483.34 on 7 df
-log2(p) of ll-ratio test,330.58


TNNI3
Minimum model


0,1
model,lifelines.CoxPHFitter
duration col,'duration'
event col,'is_hcm'
baseline estimation,breslow
number of observations,483663
number of events observed,723
partial log-likelihood,-8867.78
time fit was run,2023-12-06 19:46:27 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
sex,0.78,2.17,0.08,0.62,0.93,1.87,2.53,0.0,10.01,<0.005,75.96
is_family_hist,0.59,1.8,0.08,0.44,0.74,1.55,2.09,0.0,7.68,<0.005,45.88
age,-0.06,0.94,0.01,-0.08,-0.05,0.93,0.95,0.0,-9.9,<0.005,74.28
principal_component_1,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,7.66,<0.005,45.63
principal_component_4,-0.01,0.99,0.0,-0.01,-0.0,0.99,1.0,0.0,-3.32,<0.005,10.14
prs_score,34.86,1380000000000000.0,2.61,29.74,39.99,8260000000000.0,2.32e+17,0.0,13.34,<0.005,132.46
am_pathogenicity,2.58,13.13,0.83,0.95,4.2,2.6,66.42,0.0,3.11,<0.005,9.08

0,1
Concordance,0.73
Partial AIC,17749.57
log-likelihood ratio test,484.77 on 7 df
-log2(p) of ll-ratio test,331.60


TTR
Minimum model


0,1
model,lifelines.CoxPHFitter
duration col,'duration'
event col,'is_hcm'
baseline estimation,breslow
number of observations,483663
number of events observed,723
partial log-likelihood,-8870.37
time fit was run,2023-12-06 19:46:30 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
sex,0.78,2.17,0.08,0.62,0.93,1.87,2.53,0.0,10.02,<0.005,76.06
is_family_hist,0.59,1.8,0.08,0.44,0.74,1.55,2.09,0.0,7.7,<0.005,46.06
age,-0.06,0.94,0.01,-0.08,-0.05,0.93,0.95,0.0,-9.89,<0.005,74.23
principal_component_1,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,7.66,<0.005,45.6
principal_component_4,-0.01,0.99,0.0,-0.01,-0.0,0.99,1.0,0.0,-3.32,<0.005,10.1
prs_score,34.91,1440000000000000.0,2.62,29.78,40.03,8580000000000.0,2.43e+17,0.0,13.35,<0.005,132.59
am_pathogenicity,0.12,1.13,2.17,-4.14,4.38,0.02,80.22,0.0,0.06,0.95,0.07

0,1
Concordance,0.72
Partial AIC,17754.75
log-likelihood ratio test,479.58 on 7 df
-log2(p) of ll-ratio test,327.90
