In [141]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
import os
from sklearn.model_selection import train_test_split
from scipy.stats import entropy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sksurv.datasets import load_breast_cancer
from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from sksurv.preprocessing import OneHotEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import pickle
from datetime import datetime


# Cancer-type codes to process. Each code is used as the file-name prefix of
# the per-cancer pickles read by the driver loop (e.g. '<code>_clinical.pkl').
cancers = ['BLCA', 'BRCA', 'HNSC', 'LAML', 'LGG', 'LUAD']

# The data root depends on which user account runs the notebook.
# os.getlogin() raises OSError when the process has no controlling terminal
# (which can happen for a kernel started by a service or scheduler), so fall
# back to the user name exported in the environment instead of aborting here.
try:
    _login_name = os.getlogin()
except OSError:
    _login_name = os.environ.get('USERNAME') or os.environ.get('USER') or ''

path_prefix = 'C:/Users/sharony/SurvivalAnalysis/'
if _login_name == 'meiry':
    path_prefix = 'D:/sharon/medical_genomics_data/'


# Fractions intended for a train/test split of the samples.
train_portion = 0.85
test_portion = 0.15

In [142]:
def calc_time_to_event(x):
    """Return the number of days to death or to last contact for one clinical row.

    Parameters
    ----------
    x : row-like object
        Must expose ``vital_status``, ``death_days_to`` and
        ``last_contact_days_to`` as attributes (for example a row handed over
        by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int or None
        For ``vital_status == 'Dead'``: ``int(death_days_to)``.
        Otherwise: the absolute value of ``int(last_contact_days_to)``; a
        negative entry is reported and its sign is dropped.
        ``None`` when the relevant field cannot be converted to an integer.
        The function only reports such rows; the caller has to discard them.

    Raises
    ------
    AssertionError
        If ``vital_status`` is neither 'Dead' nor 'Alive', or if a dead
        patient's ``death_days_to`` is negative.
    """
    assert x.vital_status in ['Dead', 'Alive']

    # Only conversion failures are treated as "bad entry"; anything else
    # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
    conversion_errors = (TypeError, ValueError, OverflowError)

    if x.vital_status=='Dead':
        try:
            days_to_death = int(x.death_days_to)
        except conversion_errors:
            print(type(x.death_days_to), 'non int entry in x.death_days_to' , x.death_days_to, 'removing row',x.vital_status, x.death_days_to)
            return None
        # Checked on the float value so that e.g. -0.5 is still rejected even
        # though int() would truncate it to 0.
        assert float(x.death_days_to) >= 0
        return days_to_death

    try:
        days_to_last_contact = int(x.last_contact_days_to)
    except conversion_errors:
        print(type(x.last_contact_days_to), 'non int entry in x.last_contact_days_to' , x.last_contact_days_to, 'removing row',x.vital_status, x.death_days_to)
        return None
    if days_to_last_contact < 0:
        print('negative entry in x.last_contact_days_to' , x.last_contact_days_to, 'fixing it')
    return abs(days_to_last_contact)


In [143]:
def cross_validation(cox_model,X,y):
    """Grid-search the alphas of an already fitted Coxnet model with 5-fold CV.

    Parameters
    ----------
    cox_model : fitted estimator
        Must expose ``alphas_``; every value in it becomes one grid candidate.
        The grid key is hard-wired to the pipeline step name
        ``coxnetsurvivalanalysis``.
    X : feature matrix passed to ``GridSearchCV.fit``.
    y : survival target passed to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame
        The search's ``cv_results_`` as a DataFrame, one row per alpha.
    """
    # Each candidate is a one-element list because the estimator parameter
    # being tuned is ``alphas`` (a sequence), not a scalar.
    alpha_grid = [[alpha] for alpha in cox_model.alphas_]
    folds = KFold(n_splits=5, shuffle=True, random_state=0)
    search = GridSearchCV(
        make_pipeline(cox_model),
        param_grid={"coxnetsurvivalanalysis__alphas": alpha_grid},
        cv=folds,
        error_score=0.5,  # score recorded for a fold whose fit raised
        n_jobs=4,
    )
    search.fit(X, y)
    return pd.DataFrame(search.cv_results_)
    

In [144]:
def cox_dict(cv_results):
    """Map each tested alpha to its mean cross-validated test score.

    Parameters
    ----------
    cv_results : pandas.DataFrame
        Must contain the columns ``mean_test_score`` and
        ``param_coxnetsurvivalanalysis__alphas``; every entry of the latter is
        a sequence whose first element is the alpha value.

    Returns
    -------
    dict
        ``{alpha: mean_test_score}`` in row order. If an alpha occurs more
        than once, the last row wins.
    """
    alpha_lists = cv_results.param_coxnetsurvivalanalysis__alphas
    mean_scores = cv_results.mean_test_score
    # Pair the two columns by position rather than by index label, so a
    # filtered or re-indexed frame does not raise KeyError.
    return {alpha_list[0]: score for alpha_list, score in zip(alpha_lists, mean_scores)}
    

In [147]:
def cox_model(ct, clinical_data, omics, comment):
    """Fit four Coxnet variants on one omics table and print their CV scores.

    For each of: (almost) no regularization, ridge, lasso and elastic net, the
    model is fitted on all rows, every alpha of the fitted model is scored by
    ``cross_validation``, and the resulting ``{alpha: mean_test_score}`` dict
    (see ``cox_dict``) is printed.

    Parameters
    ----------
    ct : str
        Cancer-type label, used only in the printed header.
    clinical_data : pandas.DataFrame
        One row per sample; must provide ``vital_status`` plus the columns
        read by ``calc_time_to_event``.
    omics : pandas.DataFrame
        Feature table. Its rows are paired with ``clinical_data`` rows by
        position (no index join is performed), so both must have the same
        length and order.
    comment : str
        Name of the omics view, used only in the printed header.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If the two tables differ in length, or if no row has a usable
        time-to-event value.
    """
    if len(omics) != len(clinical_data):
        raise ValueError(
            f"omics has {len(omics)} rows but clinical_data has {len(clinical_data)}; "
            "rows are paired by position and must match"
        )

    # Data preparation
    time_to_event = clinical_data.apply(calc_time_to_event, axis = 1)
    event = np.asarray(clinical_data['vital_status'].values == 'Dead', dtype=bool)

    # calc_time_to_event returns None for rows whose day count cannot be
    # parsed. Those rows must be dropped from both the features and the
    # target; casting them to int32 directly would raise.
    valid = time_to_event.notna().to_numpy()
    n_dropped = int((~valid).sum())
    if n_dropped:
        print(n_dropped, 'rows without a usable time to event were removed')
    if not valid.any():
        raise ValueError('no rows with a usable time to event')

    X = omics.to_numpy()[valid]

    # Structured survival target: boolean event indicator + float64 time.
    y2 = np.empty(int(valid.sum()), dtype=[('e.tdm', '?'), ('t.tdm', '<f8')])
    y2['e.tdm'] = event[valid]
    y2['t.tdm'] = time_to_event[valid].astype('int32').to_numpy()

    n_alphas = 20
    # (printed title, estimator) in the order the results are reported.
    models = [
        ("Cox model no regularization:\n",
         CoxnetSurvivalAnalysis(l1_ratio=1.0, alpha_min_ratio=1e-16, n_alphas = 1, alphas = [1e-16])),
        # Ridge: l1_ratio is set to a tiny positive value rather than 0.
        ("Cox model ridge:\n",
         CoxnetSurvivalAnalysis(alphas=[0.05,0.1,0.3,0.5,0.7], alpha_min_ratio=1,  l1_ratio=1e-16, normalize=False)),
        # Lasso
        ("Cox model lasso:\n",
         CoxnetSurvivalAnalysis(l1_ratio=1.0,alpha_min_ratio=0.05, n_alphas = n_alphas)),
        # Elastic net
        ("Cox model elastic net:\n",
         CoxnetSurvivalAnalysis(l1_ratio=0.5,alpha_min_ratio=0.05, n_alphas = n_alphas)),
    ]

    print("Cox results for cancer type",ct,"with",comment,"omics:\n")
    for title, model in models:
        # The initial fit populates model.alphas_, which cross_validation
        # reads to build its parameter grid.
        model.fit(X, y2)
        alpha_scores = cox_dict(cross_validation(model, X, y2))
        print(title)
        print(alpha_scores,"\n")
    
    
  

In [155]:
# Run the Cox analysis for every cancer type and every available omics view.
# All pickles are looked up relative to the current working directory.
# NOTE(review): pickle.load executes arbitrary code from the file; only use it
# on pickles produced by this project.
for ct in cancers:
    print(datetime.now(), ct, 'Entered')
    # NOTE(review): ct_path is computed but not used in this cell.
    ct_path = f'{path_prefix}{ct}'

    clinical_file = f'{ct}_clinical.pkl'
    if not os.path.exists(clinical_file):
        print(datetime.now(), ct, 'skipping because clinical data was not processed')
        continue
    with open(clinical_file, 'rb') as clinical_fh:
        clinical_data = pickle.load(clinical_fh)

    # (label passed to cox_model, pickle file name) for each omics view,
    # in the order they are analysed.
    omics_views = [
        ('merged', f'{ct}_omics.pkl'),
        ('exp', f'{ct}_exp_omics.pkl'),
        ('methy', f'{ct}_methy_omics.pkl'),
        ('mirna', f'{ct}_mirna_omics.pkl'),
    ]
    for view_label, omics_file in omics_views:
        if not os.path.exists(omics_file):
            print(datetime.now(), 'Not found', view_label, omics_file)
            continue
        with open(omics_file, 'rb') as omics_fh:
            omics_table = pickle.load(omics_fh)
        cox_model(ct, clinical_data, omics_table, view_label)
        print(datetime.now())

    print()
    print(datetime.now(), ct, 'DONE')
    print()


2020-12-14 18:31:26.905491 LAML Entered
Cox results for cancer type LAML with exp omics:

Cox model no regularization:

{1e-16: 0.5739921196056166} 

Cox model ridge:

{0.05: 0.6412346658665676, 0.1: 0.6371671092990111, 0.3: 0.6347534570233957, 0.5: 0.6307361430060817, 0.7: 0.6323456371155757} 

Cox model lasso:

{0.08899697208638058: 0.5148586204184363, 0.07601511696881874: 0.5417353311141655, 0.06492690562747203: 0.5401478256386232, 0.0554561150788974: 0.5444367849275824, 0.047366814572827105: 0.5685001402332691, 0.04045748822442025: 0.5718385193998691, 0.03455601496935136: 0.5555848780081909, 0.029515380785336595: 0.5422607666319322, 0.0252100163654886: 0.5482162707776205, 0.021532669009777614: 0.5558234037988639, 0.018391730809003425: 0.5777564168361715, 0.01570895656257242: 0.5906735121152299, 0.013417514580192936: 0.5969846584263762, 0.011460321810209988: 0.5988230949887392, 0.009788621820277997: 0.6063536954334501, 0.008360770205864478: 0.597160990964672, 0.007141197169397429: 0