# Cox regression model
#### Selin Kubali
#### Updated 10-11-2023

Goal: Model all genes related to hypertrophic cardiomyopathy with a Cox regression model in three possible configurations: with all possible parameters, with parameters based on the procedure given in [Fife et al. 2021](https://www.medrxiv.org/content/10.1101/2021.08.12.21261563v1), and with only statistically significant parameters. 

Input: one .csv file per gene, found in /selected_genes/hcm/lifelines_data in the Cassa Lab project on UKBiobank. Instructions for how to generate these files are found in the notebooks in /selected_genes/hcm/notebooks.


In [2]:
import pandas as pd
from lifelines import CoxPHFitter
import numpy as np
from lifelines import KaplanMeierFitter
import os
# Make the data directory the working directory: create_model() opens
# "<gene>.csv" by relative path, so the per-gene files must be found here.
# NOTE(review): this is a machine-specific absolute path; the output recorded
# for this cell is a FileNotFoundError for it -- point it at your local copy
# of the data before running.
os.chdir('/Users/uriel/Downloads/work_temp/gnomadAI_lifelines')

FileNotFoundError: [Errno 2] No such file or directory: '/Users/uriel/Downloads/work_temp/gnomadAI_lifelines'

In [1]:
# Gene symbols to model, in the order they are processed. Each entry needs a
# matching "<gene>.csv" file in the working directory.
genes = [
    "ACTN2",
    "ALPK3",
    "DES",
    "FLNC",
    "MYBPC3",
    "MYH6",
    "MYH7",
    "PTPN11",
    "TNNI3",
    "TTR",
]
def create_model(gene, min=False, clintegrate=False):
    """Fit a Cox proportional-hazards model for one gene and print its summary.

    Reads ``<gene>.csv`` from the current working directory, removes the
    ``Name`` and ``index`` columns and every row containing a missing value,
    optionally removes covariates according to the requested configuration,
    and fits a lifelines ``CoxPHFitter`` using ``duration`` as the duration
    column and ``is_hcm`` as the event column. With both flags left at their
    defaults, all remaining columns are used as covariates.

    Parameters
    ----------
    gene : str
        Gene symbol; the input file is ``gene + '.csv'``.
    min : bool, default False
        If truthy, print "Minimum model" and fit with the reduced covariate
        set (everything the clintegrate configuration removes, plus the
        splicing and functional-consequence columns). The name shadows the
        builtin ``min`` inside this function; it is kept because callers
        pass it by keyword.
    clintegrate : bool, default False
        If truthy, print "Clintegrate model" and remove the covariates
        excluded by the clintegrate configuration.

    Returns
    -------
    None
        The fitted model's summary is printed, not returned.

    Raises
    ------
    FileNotFoundError
        If ``<gene>.csv`` does not exist in the working directory.
    KeyError
        If a column that is to be dropped is absent from the file.

    Notes
    -----
    The minimum configuration removes a superset of the clintegrate columns.
    When both flags are set, the columns are therefore dropped once (the
    minimum set) instead of dropping the shared columns twice, which used to
    raise ``KeyError``.
    """
    # Covariates removed by the clintegrate configuration. Per the original
    # author's note, the columns that are kept had p < 0.05, were categorical
    # functional-consequence variables (synonymous_variant, missense_variant,
    # deleterious), or improved model concordance by at least 0.01; the
    # selection is the same for all tested genes.
    clintegrate_dropped = [
        f'principal_component_{i}' for i in (2, 3, 5, 6, 7, 8, 9, 10)
    ] + ['CADD_raw', 'phyloP100way_vertebrate', 'GERP++_RS']

    # Additional covariates removed by the minimum configuration, which per
    # the original author's note keeps only columns with p < 0.05 (same for
    # all tested genes).
    min_only_dropped = [
        'trv', 'ds_ag', 'ds_al', 'ds_dg', 'ds_dl',
        'missense_variant', 'synonymous_variant', 'deleterious',
    ]

    # Read the per-gene table; the two flag columns are parsed with pandas'
    # nullable 'boolean' dtype.
    # NOTE(review): an earlier comment mentioned converting is_hcm and
    # is_family_hist to integer, but no conversion is performed here --
    # confirm lifelines handles the nullable boolean columns as intended.
    phenotypic_data = pd.read_csv(
        f'{gene}.csv',
        dtype={'is_family_hist': 'boolean', 'is_hcm': 'boolean'},
    )

    # Remove bookkeeping columns, then discard every row with a missing value.
    phenotypic_data = phenotypic_data.drop(columns=['Name', 'index']).dropna()

    # Wherever a variant is flagged deleterious, overwrite its
    # am_pathogenicity value with 1.
    is_deleterious = phenotypic_data['deleterious'] == 1
    phenotypic_data.loc[is_deleterious, 'am_pathogenicity'] = 1

    # Work out the configuration-specific columns to remove. This must happen
    # after the overwrite above, because the minimum set removes 'deleterious'.
    columns_to_drop = []
    if clintegrate:
        print("Clintegrate model")
        columns_to_drop = clintegrate_dropped
    if min:
        print("Minimum model")
        columns_to_drop = clintegrate_dropped + min_only_dropped
    if columns_to_drop:
        phenotypic_data = phenotypic_data.drop(columns=columns_to_drop)

    # Fit the Cox model on the remaining columns and print the summary table.
    cph = CoxPHFitter()
    cph.fit(
        phenotypic_data,
        duration_col='duration',
        event_col='is_hcm',
        fit_options={"step_size": 0.1},
    )
    cph.print_summary()





# Fit and print the minimum model for every gene in turn. To run one of the
# other configurations instead, swap the call for create_model(gene) (all
# covariates) or create_model(gene, clintegrate=True).
for gene in genes:
    print(gene)
    create_model(gene, min=True)



ACTN2


NameError: name 'pd' is not defined