# Hypertrophic Cardiomyopathy Genes Cross-Validation
##### Selin Kubali
##### 12/13/2023
## Goal
Find out whether we can distinguish the HCM risk of bottom 25% and top 25% of missense and deleterious variant carriers in key hypertrophic cardiomyopathy-related genes.

#### How the code functions
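Use cross-validation to fit a Cox-PH model and predict hazard scores. Then split carriers into a low-hazard and a high-hazard group by hazard-score percentile and test whether there is a statistically significant difference in HCM between the two groups using the log-rank test. The current code sweeps every percentile cutoff from 1 to 100 (rather than only comparing the bottom 25% with the top 25%) and applies a Bonferroni correction across the cutoffs.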

Cross-validation is done by splitting on variant data, to ensure there are an equal number of variants in each fold and prevent overfitting on high-frequency variants.

#### Inputs
- Lifelines files: generated by running generate_extracts_gnomAD.ipynb on UK Biobank in Cassa Lab Shared Project/selected_genes/hcm/notebooks. Stored in Cassa Lab Shared Project/selected_genes/hcm/lifelines_data.
- Variant data files: generated by running vep_processing.ipynb on UK Biobank in Cassa Lab Shared Project/selected_genes/hcm/notebooks. Stored in Cassa Lab Shared Project/selected_genes/hcm/parsed_vep_files.

#### Note
Two HCM-related genes, DES and PLN, were excluded because they have too few variants for the model to converge.
PTPN11, TNNI3, and TTR each have few HCM cases among carriers of missense or deleterious variants, which may harm convergence.

In [89]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
from sklearn.model_selection import KFold
from statsmodels.stats.multitest import multipletests
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt
from lifelines.statistics import logrank_test


In [131]:
# Log-rank p-value per hazard-percentile cutoff; filled in by cross_val.
p_vals = {}
# Percentile cutoffs (1 through 100) that cross_val sweeps over.
thresholds_list = [*range(1, 101)]
# Genes to analyse; cross_val reads "<gene>.csv" from each input folder.
genes = ["PTPN11"]

def cross_val(gene, *, thresholds=None,
              lifelines_dir="/Users/uriel/Downloads/work_temp/cross_val_lifelines/",
              variant_dir="/Users/uriel/Downloads/work_temp/variant_files/"):
    """Cross-validate a Cox-PH model for one gene and log-rank test hazard splits.

    Carriers are assigned to folds by the variant they carry (5-fold KFold
    over the rows of the variant file), a Cox-PH model is fitted on the
    training folds, and partial hazards are predicted for the held-out fold.
    The out-of-fold hazards of all carriers are then pooled, and for every
    percentile cutoff the carriers below the cutoff are compared with the
    carriers at or above it using a log-rank test.

    Args:
        gene: Gene symbol; "<gene>.csv" is read from both input directories.
        thresholds: Iterable of hazard-score percentiles (0-100) to use as
            cutoffs. Defaults to the module-level ``thresholds_list``.
        lifelines_dir: Directory holding the per-gene lifelines CSV.
        variant_dir: Directory holding the per-gene variant CSV.

    Returns:
        dict mapping each percentile cutoff to its log-rank p-value.

    Side effects:
        Prints the columns of the merged table and merges the returned
        p-values into the module-level ``p_vals`` dict (keyed by cutoff only).
    """
    from pathlib import Path

    if thresholds is None:
        thresholds = thresholds_list

    file_name = gene + '.csv'

    # load lifelines file
    lifelines_data = pd.read_csv(Path(lifelines_dir) / file_name, dtype={
        'is_family_hist': 'boolean',
        'is_hcm': 'boolean',
    })

    # load variant data file; var_index is the row position of each variant,
    # which is what KFold.split hands back as train/test indices below
    variant_data = pd.read_csv(Path(variant_dir) / file_name)
    variant_data = variant_data[['Name']].copy()
    variant_data['var_index'] = variant_data.index

    # attach var_index to every carrier row (merge is on the shared columns).
    # var_index must stay a regular column: the fold filter below looks it up
    # with lifelines_data['var_index'].
    lifelines_data = variant_data.merge(lifelines_data, how="outer")

    # drop the columns that are not model inputs
    lifelines_data = lifelines_data.drop(
        ["Name", 'Carrier', 'index', 'sex', 'is_family_hist', 'age',
         'principal_component_1', 'principal_component_4', 'prs_score'],
        axis=1,
    )
    # set pathogenicity for deleterious variants to 1, so those rows have a
    # non-null am_pathogenicity when dropna runs
    lifelines_data.loc[lifelines_data['deleterious'] == 1, 'am_pathogenicity'] = 1

    print(lifelines_data.columns)

    # remove incomplete rows first, then drop am_pathogenicity itself so it is
    # used only as a row filter and not as a covariate
    lifelines_data = lifelines_data.dropna()
    lifelines_data = lifelines_data.drop(["am_pathogenicity"], axis=1)

    # cross validation: split up phenotypic data based on variant file index
    cph = CoxPHFitter(penalizer=0.0000001)
    kf = KFold(n_splits=5, shuffle=True, random_state=1)
    testing_set = []
    for train_idx, test_idx in kf.split(variant_data):
        in_train = lifelines_data['var_index'].isin(train_idx)
        in_test = lifelines_data['var_index'].isin(test_idx)
        train = lifelines_data[in_train].drop(['var_index'], axis=1)
        test = lifelines_data[in_test].drop(['var_index'], axis=1)

        # fit CPH on the training folds and score the held-out fold;
        # assign() returns a new frame instead of writing into a selection
        cph.fit(train, duration_col="duration", event_col="is_hcm",
                fit_options={"step_size": 0.1})
        test = test.assign(hazard=cph.predict_partial_hazard(test))
        testing_set.append(test)

    # pool the out-of-fold predictions of all carriers
    scored = pd.concat(testing_set)

    # the event indicator does not depend on the cutoff, so convert it once
    # (no missing values remain after the dropna above)
    durations = scored['duration']
    events = scored['is_hcm'].astype(int)
    hazards = scored['hazard']

    # sweep the percentile cutoffs: carriers below the cutoff vs. at/above it
    # NOTE(review): with many tied hazards one side of the split can be empty
    # at extreme cutoffs - confirm how logrank_test handles that case.
    gene_p_vals = {}
    for i in thresholds:
        cutoff = np.percentile(hazards, i)
        is_bottom = hazards < cutoff
        is_top = hazards >= cutoff
        result = logrank_test(
            durations[is_bottom], durations[is_top],
            event_observed_A=events[is_bottom],
            event_observed_B=events[is_top],
        )
        gene_p_vals[i] = result.p_value

    p_vals.update(gene_p_vals)
    return gene_p_vals




# Run the cross-validated percentile sweep for every configured gene.
# NOTE(review): cross_val stores its p-values in the shared p_vals dict keyed
# only by percentile cutoff, so with more than one gene in `genes` each gene
# overwrites the p-values of the previous one.
for gene in genes:
    cross_val(gene)



Index(['var_index', 'is_hcm', 'duration', 'am_pathogenicity', 'GERP++_RS',
       'trv', 'CADD_raw', 'phyloP100way_vertebrate', 'deleterious',
       'missense_variant'],
      dtype='object')


In [132]:
# Bonferroni-correct the per-cutoff log-rank p-values. Element 1 of the
# multipletests result holds the corrected p-values in input order, so they
# can be zipped straight back onto the cutoff keys.
p_adjusted = multipletests([*p_vals.values()], alpha=0.05, method='bonferroni')
updated_dict = dict(zip(p_vals, p_adjusted[1]))
print("P-values: ", updated_dict)

# Smallest corrected p-value across all cutoffs.
print(min(updated_dict.values()))



P-values:  {1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0, 9: 1.0, 10: 1.0, 11: 1.0, 12: 1.0, 13: 1.0, 14: 1.0, 15: 1.0, 16: 1.0, 17: 1.0, 18: 1.0, 19: 1.0, 20: 1.0, 21: 1.0, 22: 1.0, 23: 1.0, 24: 1.0, 25: 1.0, 26: 1.0, 27: 1.0, 28: 1.0, 29: 1.0, 30: 1.0, 31: 1.0, 32: 1.0, 33: 1.0, 34: 1.0, 35: 1.0, 36: 1.0, 37: 1.0, 38: 1.0, 39: 1.0, 40: 1.0, 41: 1.0, 42: 1.0, 43: 1.0, 44: 1.0, 45: 1.0, 46: 1.0, 47: 1.0, 48: 1.0, 49: 1.0, 50: 1.0, 51: 1.0, 52: 1.0, 53: 1.0, 54: 1.0, 55: 1.0, 56: 1.0, 57: 1.0, 58: 1.0, 59: 1.0, 60: 1.0, 61: 1.0, 62: 1.0, 63: 1.0, 64: 1.0, 65: 1.0, 66: 1.0, 67: 1.0, 68: 1.0, 69: 1.0, 70: 1.0, 71: 1.0, 72: 1.0, 73: 1.0, 74: 1.0, 75: 1.0, 76: 1.0, 77: 1.0, 78: 1.0, 79: 1.0, 80: 1.0, 81: 1.0, 82: 1.0, 83: 1.0, 84: 1.0, 85: 1.0, 86: 1.0, 87: 1.0, 88: 1.0, 89: 1.0, 90: 1.0, 91: 1.0, 92: 0.9920132948937195, 93: 0.41996046275585547, 94: 0.41996046275585547, 95: 1.0, 96: 1.0, 97: 1.0, 98: 1.0, 99: 1.0, 100: 1.0}
0.41996046275585547
