In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import ShuffleSplit, GridSearchCV,train_test_split

from sksurv.datasets import load_veterans_lung_cancer
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM,FastKernelSurvivalSVM
from sksurv.kernels import clinical_kernel

##### In this notebook, we first define, in a dictionary, the different feature sets that we want to feed to our evaluation model.
##### We then run a grid search on a survival SVM to find a good set of hyper-parameters on the training set (using internal validation splits)
##### and refit the model with the best parameters found. Finally, we evaluate the refitted model on a separate test set.
##### This train/evaluate procedure is repeated over several random train/test splits (25 or 10, depending on the cell).

In [2]:
def score_survival_model(model, X, y):
    """Score a fitted survival model with the censored concordance index.

    The ``(estimator, X, y)`` signature lets this function be passed as the
    ``scoring`` callable of ``GridSearchCV``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    X : feature matrix forwarded unchanged to ``model.predict``.
    y : record-like object indexable by the field names ``'Status'``
        (event indicator) and ``'Survival_in_days'`` (observed time).

    Returns
    -------
    The first element of the tuple returned by
    ``concordance_index_censored`` (the concordance index itself).
    """
    risk_scores = model.predict(X)
    event_indicator = y['Status']
    event_time = y['Survival_in_days']
    cindex = concordance_index_censored(event_indicator, event_time, risk_scores)[0]
    return cindex

In [25]:
# Positional column indices, one list per feature combination. Each name joins
# the feature groups it contains with "_". Layout implied by the single-group
# lists below: 0 = eln, 1..83 = gen, 84..152 = cyto, 153..167 = comp,
# 168..176 = clin + demo (clin alone = 168..174).
all_features = [*range(177)]

# Combinations that include column 0 (eln).
eln_clin = [0, *range(168, 175)]
eln_clin_demo = [0, *range(168, 177)]
eln_clin_demo_cyto = [0, *range(84, 153), *range(168, 177)]
eln_clin_demo_gen = [*range(84), *range(168, 177)]
eln_clin_demo_cyto_gen = [*range(153), *range(168, 177)]
eln_clin_demo_comp = [0, *range(153, 177)]
eln_cyto_gen = [*range(153)]
eln_cyto_gen_comp = [*range(168)]
eln_cyto_comp = [0, *range(84, 168)]
eln_gen_comp = [*range(84), *range(153, 168)]

# Combinations that leave column 0 out.
clin_demo = [*range(168, 177)]
clin_demo_cyto = [*range(84, 153), *range(168, 177)]
clin_demo_gen = [*range(1, 84), *range(168, 177)]
clin_demo_cyto_gen = [*range(1, 153), *range(168, 177)]
clin_demo_comp = [*range(153, 177)]
cyto_gen = [*range(1, 153)]
cyto_gen_comp = [*range(1, 168)]
cyto_comp = [*range(84, 168)]
gen_comp = [*range(1, 84), *range(153, 168)]
clin_demo_cyto_gen_comp = [*range(1, 177)]

# Single feature groups.
gen = [*range(1, 84)]
cyto = [*range(84, 153)]
comp = [*range(153, 168)]

# Name -> column indices. Every key sits next to its list so that a
# mismatched pair is visible at a glance.
dict_features_type = {
    "all_features": all_features,
    "eln_clin": eln_clin,
    "eln_clin_demo": eln_clin_demo,
    "eln_clin_demo_cyto": eln_clin_demo_cyto,
    "eln_clin_demo_gen": eln_clin_demo_gen,
    "eln_clin_demo_cyto_gen": eln_clin_demo_cyto_gen,
    "eln_clin_demo_comp": eln_clin_demo_comp,
    "eln_cyto_gen": eln_cyto_gen,
    "eln_cyto_gen_comp": eln_cyto_gen_comp,
    "eln_cyto_comp": eln_cyto_comp,
    "eln_gen_comp": eln_gen_comp,
    "clin_demo": clin_demo,
    "clin_demo_cyto": clin_demo_cyto,
    "clin_demo_gen": clin_demo_gen,
    "clin_demo_cyto_gen": clin_demo_cyto_gen,
    "clin_demo_comp": clin_demo_comp,
    "cyto_gen": cyto_gen,
    "cyto_gen_comp": cyto_gen_comp,
    "cyto_comp": cyto_comp,
    "gen_comp": gen_comp,
    "clin_demo_cyto_gen_comp": clin_demo_cyto_gen_comp,
    "gen": gen,
    "cyto": cyto,
    "comp": comp,
}

# Load the tab-separated feature table from the working directory.
df_final = pd.read_csv("df_prognosis_features_ready.tsv", sep="\t")

In [None]:
### Column indices for the data set with the final components. Only the
### feature combinations that contain the component block ("comp") are
### redefined: that block now spans columns 153..170 and the columns that
### follow it run up to 179.

eln_clin_demo_comp = [0, *range(153, 180)]
eln_cyto_gen_comp = [*range(171)]
eln_cyto_comp = [0, *range(84, 171)]
eln_gen_comp = [*range(84), *range(153, 171)]
clin_demo_comp = [*range(153, 180)]
cyto_gen_comp = [*range(1, 171)]
cyto_comp = [*range(84, 171)]
gen_comp = [*range(1, 84), *range(153, 171)]
clin_demo_cyto_gen_comp = [*range(1, 180)]
comp = [*range(153, 171)]
# Repeated hold-out evaluation of a linear survival SVM on the final-component
# table: for every feature set, split the data 80/20 with several seeds, tune
# `alpha` by grid search on the training part, and record the concordance
# index of the refitted best model on the held-out part.
df_final = pd.read_table("df_prognosis_features_ready_final_component.tsv")
estimator = FastSurvivalSVM(max_iter=1000, tol=1e-6, random_state=17)
param_grid = {'alpha': 10. ** np.array([-6, -5.5, -5, -4.5, -2.5, -1, 0]), 'optimizer': ["avltree"]}
cv = ShuffleSplit(n_splits=5, random_state=17)
# `iid` is deliberately not passed: the argument was removed in scikit-learn
# 0.24 (passing it raises TypeError there), and the cross-validation score is
# the plain mean over folds, which is what `iid=False` used to request.
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=4, refit=True,
                   cv=cv)

# Output column name -> positional column indices of the feature set.
dict_features_type_final_comp = {
    "eln_clin_demo_comp_final_comp": eln_clin_demo_comp,
    "eln_cyto_gen_comp_final_comp": eln_cyto_gen_comp,
    "eln_cyto_comp_final_comp": eln_cyto_comp,
    "eln_gen_comp_final_comp": eln_gen_comp,
    "clin_demo_comp_final_comp": clin_demo_comp,
    "cyto_gen_comp_final_comp": cyto_gen_comp,
    "cyto_comp_final_comp": cyto_comp,
    "gen_comp_final_comp": gen_comp,
    "clin_demo_cyto_gen_comp_final_comp": clin_demo_cyto_gen_comp,
    "comp_final_comp": comp,
}

# Structured outcome array (boolean 'Status', float 'Survival_in_days') built
# from the `os_status` and `os` columns. It does not depend on the feature
# set, so it is built once instead of once per feature set.
y = np.array(list(zip(df_final.os_status, df_final.os)),
             dtype=[('Status', '?'), ('Survival_in_days', '<f8')])

n_repeats = 25  # number of random train/test splits per feature set
ci_by_feature_set = {}
for key, item in dict_features_type_final_comp.items():
    x = df_final.iloc[:, item]
    ci = []
    for i in range(n_repeats):
        # The seed doubles as the repetition index, so split i is the same
        # for every feature set.
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
        gcv.fit(X_train, y_train)  # fit() returns the estimator itself; no rebinding needed
        print(gcv.best_params_)
        # refit=True: prediction goes through the best estimator refitted on X_train.
        ci.append(score_survival_model(gcv, X_test, y_test))
        print(ci)
    ci_by_feature_set[key] = ci

# One column of test-set concordance indices per feature set, one row per split.
df = pd.DataFrame(ci_by_feature_set, columns=list(dict_features_type_final_comp))

df.to_csv("SVM_some.csv")

In [18]:
# Positional column indices (feature columns 0..180) used with the
# "prognosis_comp_*" tables. Layout implied by the lists below:
#   0 = eln, 1..83 = gen, 84..152 = cyto, 153..159 = clin,
#   160..161 = demo (161 is the column dropped by the "*_without_age"
#   variants), 162..180 = comp.
all_features = list(range(181))
clin_demo_comp = list(range(153, 181))
clin_demo_cyto_gen_comp = list(range(1, 181))
comp = list(range(162, 181))
cyto_comp = list(range(84, 153)) + list(range(162, 181))
cyto_gen_comp = list(range(1, 153)) + list(range(162, 181))
# eln is column 0, as in every other eln_* list; this list previously started
# with column 1, which belongs to the gen block.
eln_clin_demo_comp = [0] + list(range(153, 181))
eln_cyto_comp = [0] + list(range(84, 153)) + list(range(162, 181))
eln_cyto_gen_comp = list(range(153)) + list(range(162, 181))
eln_gen_comp = list(range(84)) + list(range(162, 181))
gen_comp = list(range(1, 84)) + list(range(162, 181))
# clin starts at column 153 (see eln_clin_comp / clin_gen_comp); this list
# previously started at 152, which is the last cyto column.
clin_comp = list(range(153, 160)) + list(range(162, 181))
clin_cyto_comp = list(range(84, 160)) + list(range(162, 181))
clin_gen_comp = list(range(1, 84)) + list(range(153, 160)) + list(range(162, 181))
eln_clin_comp = [0] + list(range(153, 160)) + list(range(162, 181))

# Without age: same combinations with column 161 left out.
all_features_without_age = list(range(161)) + list(range(162, 181))
clin_demo_comp_without_age = list(range(153, 161)) + list(range(162, 181))
clin_demo_cyto_gen_comp_without_age = list(range(1, 161)) + list(range(162, 181))
eln_clin_demo_comp_without_age = [0] + list(range(153, 161)) + list(range(162, 181))
            
            
            
#eln_clin_gen = list(range(84))+list(range(153,160))  
#eln_demo_gen = list(range(84))+[160,161] 
#eln_clin_demo_cyto_gen =list(range(162)) 
#eln_clin_demo_cyto = [0]+list(range(84,162))
#eln_clin_demo_gen = list(range(84))+list(range(153,162))
#eln_clin_demo = [0] + list(range(153,162))
#eln_clin = [0] + list(range(153,160))
#eln_cyto_gen = list(range(153))
#clin_demo_cyto_gen = list(range(1,162))
#clin_demo_cyto = list(range(84,162))
#clin_demo_gen = list(range(1,84))+list(range(153,162)) 
#clin_demo = list(range(153,162)) 
#cyto_gen = list(range(1,153))
#cyto = list(range(84,153))
#gen = list(range(1,84))
#clin_gen = list(range(1,84)) + list(range(153,160))  
#clin_cyto = list(range(84,160))  
#demo_gen = list(range(1,84)) + [160,161]
#demo_cyto = list(range(84,153)) + [160,161]

###Without age:

#eln_demo_gen_without_age = list(range(84)) + [160]
#eln_clin_demo_cyto_gen_without_age = list(range(161))
#eln_clin_demo_cyto_without_age = [0] + list(range(84,161))
#eln_clin_demo_gen_without_age = list(range(84)) + list(range(153,161))
#eln_clin_demo_without_age = [0] + list(range(153,161))
#clin_demo_cyto_gen_without_age = list(range(1,161))
#clin_demo_cyto_without_age = list(range(84,161))
#clin_demo_gen_without_age = list(range(1,84)) + list(range(153,161)) 
#clin_demo_without_age = list(range(153,161))
#demo_gen_without_age = list(range(1,84)) + [160]
#demo_cyto_without_age = list(range(84,153)) + [160]
                         
# Repeated hold-out evaluation of a linear survival SVM on the "elli"
# component table: for every feature set, split the data 80/20 with several
# seeds, tune `alpha` by grid search on the training part, and record the
# concordance index of the refitted best model on the held-out part.
df_final = pd.read_table("prognosis_comp_elli.tsv")
estimator = FastSurvivalSVM(max_iter=1000, tol=1e-6, random_state=17)
param_grid = {'alpha': 10. ** np.array([-6, -5.5, -5, -4.5, -2.5, -1, 0]), 'optimizer': ["avltree"]}
cv = ShuffleSplit(n_splits=5, random_state=17)
# `iid` is deliberately not passed: the argument was removed in scikit-learn
# 0.24 (passing it raises TypeError there), and the cross-validation score is
# the plain mean over folds, which is what `iid=False` used to request.
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=4, refit=True,
                   cv=cv)

# Output column name -> positional column indices of the feature set.
dict_features_type_final_comp = {
    "all_features": all_features,
    "clin_demo_comp": clin_demo_comp,
    "clin_demo_cyto_gen_comp": clin_demo_cyto_gen_comp,
    "comp": comp,
    "cyto_comp": cyto_comp,
    "cyto_gen_comp": cyto_gen_comp,
    "eln_clin_demo_comp": eln_clin_demo_comp,
    "eln_cyto_comp": eln_cyto_comp,
    "eln_cyto_gen_comp": eln_cyto_gen_comp,
    "eln_gen_comp": eln_gen_comp,
    "gen_comp": gen_comp,
    "clin_comp": clin_comp,
    "clin_cyto_comp": clin_cyto_comp,
    "clin_gen_comp": clin_gen_comp,
    "eln_clin_comp": eln_clin_comp,
    "all_features_without_age": all_features_without_age,
    "clin_demo_comp_without_age": clin_demo_comp_without_age,
    "clin_demo_cyto_gen_comp_without_age": clin_demo_cyto_gen_comp_without_age,
    "eln_clin_demo_comp_without_age": eln_clin_demo_comp_without_age,
}

# Structured outcome array (boolean 'Status', float 'Survival_in_days') built
# from the `os_status` and `os` columns. It does not depend on the feature
# set, so it is built once instead of once per feature set.
y = np.array(list(zip(df_final.os_status, df_final.os)),
             dtype=[('Status', '?'), ('Survival_in_days', '<f8')])

n_repeats = 10  # number of random train/test splits per feature set
ci_by_feature_set = {}
for key, item in dict_features_type_final_comp.items():
    x = df_final.iloc[:, item]
    ci = []
    for i in range(n_repeats):
        # The seed doubles as the repetition index, so split i is the same
        # for every feature set.
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
        gcv.fit(X_train, y_train)  # fit() returns the estimator itself; no rebinding needed
        print(gcv.best_params_)
        # refit=True: prediction goes through the best estimator refitted on X_train.
        ci.append(score_survival_model(gcv, X_test, y_test))
        print(ci)
    ci_by_feature_set[key] = ci

# One column of test-set concordance indices per feature set, one row per split.
df = pd.DataFrame(ci_by_feature_set, columns=list(dict_features_type_final_comp))

df.to_csv("SVM_comp_elli.csv")

{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7004633890881391]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7004633890881391, 0.6995711588312478]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7004633890881391, 0.6995711588312478, 0.7342578710644677]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7004633890881391, 0.6995711588312478, 0.7342578710644677, 0.7220006726565128]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7004633890881391, 0.6995711588312478, 0.7342578710644677, 0.7220006726565128, 0.7258059161382424]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.7004633890881391, 0.6995711588312478, 0.7342578710644677, 0.7220006726565128, 0.7258059161382424, 0.7146138339006577]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7004633890881391, 0.6995711588312478, 0.7342578710644677, 0.7220006726565128, 0.7258059161382424, 0.7146138339006577, 0.7415874689111064]
{'alpha': 3.1622776601683795e-05, 'optimizer

  self.best_estimator_.fit(X, y, **fit_params)


{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6977338369251278, 0.700499247927801, 0.7288387056471765, 0.7164272329793879]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.6977338369251278, 0.700499247927801, 0.7288387056471765, 0.7164272329793879, 0.7201454134988878]
{'alpha': 0.0031622776601683794, 'optimizer': 'avltree'}
[0.6977338369251278, 0.700499247927801, 0.7288387056471765, 0.7164272329793879, 0.7201454134988878, 0.7088304306156846]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6977338369251278, 0.700499247927801, 0.7288387056471765, 0.7164272329793879, 0.7201454134988878, 0.7088304306156846, 0.7414227595407903]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.6977338369251278, 0.700499247927801, 0.7288387056471765, 0.7164272329793879, 0.7201454134988878, 0.7088304306156846, 0.7414227595407903, 0.7287797581951888]
{'alpha': 0.0031622776601683794, 'optimizer': 'avltree'}
[0.6977338369251278, 0.700499247927801, 0.7288387056471765, 0.7164272329793879, 0.720145

  self.best_estimator_.fit(X, y, **fit_params)


{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6058574285079507, 0.6242679297212532, 0.6436469265367316, 0.6160572718973718, 0.6045782012549384]
{'alpha': 0.0031622776601683794, 'optimizer': 'avltree'}
[0.6058574285079507, 0.6242679297212532, 0.6436469265367316, 0.6160572718973718, 0.6045782012549384, 0.6223685515053372]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6058574285079507, 0.6242679297212532, 0.6436469265367316, 0.6160572718973718, 0.6045782012549384, 0.6223685515053372, 0.6356381664552896]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.6058574285079507, 0.6242679297212532, 0.6436469265367316, 0.6160572718973718, 0.6045782012549384, 0.6223685515053372, 0.6356381664552896, 0.6054156799202294]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.6058574285079507, 0.6242679297212532, 0.6436469265367316, 0.6160572718973718, 0.6045782012549384, 0.6223685515053372, 0.6356381664552896, 0.6054156799202294, 0.627866029002445]
{'alpha': 0.1, 'optimizer': 'avltree'}
[0

In [19]:
# Repeated hold-out evaluation of a linear survival SVM on the "yanis"
# component table. The feature-index lists are the ones already defined for
# the "elli" table; only the input file, the result column names (suffixed
# with "_yanis") and the output file differ.
df_final = pd.read_table("prognosis_comp_yanis.tsv")
estimator = FastSurvivalSVM(max_iter=1000, tol=1e-6, random_state=17)
param_grid = {'alpha': 10. ** np.array([-6, -5.5, -5, -4.5, -2.5, -1, 0]), 'optimizer': ["avltree"]}
cv = ShuffleSplit(n_splits=5, random_state=17)
# `iid` is deliberately not passed: the argument was removed in scikit-learn
# 0.24 (passing it raises TypeError there), and the cross-validation score is
# the plain mean over folds, which is what `iid=False` used to request.
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=4, refit=True,
                   cv=cv)

# Output column name -> positional column indices of the feature set.
dict_features_type_final_comp = {
    "all_features_yanis": all_features,
    "clin_demo_comp_yanis": clin_demo_comp,
    "clin_demo_cyto_gen_comp_yanis": clin_demo_cyto_gen_comp,
    "comp_yanis": comp,
    "cyto_comp_yanis": cyto_comp,
    "cyto_gen_comp_yanis": cyto_gen_comp,
    "eln_clin_demo_comp_yanis": eln_clin_demo_comp,
    "eln_cyto_comp_yanis": eln_cyto_comp,
    "eln_cyto_gen_comp_yanis": eln_cyto_gen_comp,
    "eln_gen_comp_yanis": eln_gen_comp,
    "gen_comp_yanis": gen_comp,
    "clin_comp_yanis": clin_comp,
    "clin_cyto_comp_yanis": clin_cyto_comp,
    "clin_gen_comp_yanis": clin_gen_comp,
    "eln_clin_comp_yanis": eln_clin_comp,
    "all_features_without_age_yanis": all_features_without_age,
    "clin_demo_comp_without_age_yanis": clin_demo_comp_without_age,
    "clin_demo_cyto_gen_comp_without_age_yanis": clin_demo_cyto_gen_comp_without_age,
    "eln_clin_demo_comp_without_age_yanis": eln_clin_demo_comp_without_age,
}

# Structured outcome array (boolean 'Status', float 'Survival_in_days') built
# from the `os_status` and `os` columns. It does not depend on the feature
# set, so it is built once instead of once per feature set.
y = np.array(list(zip(df_final.os_status, df_final.os)),
             dtype=[('Status', '?'), ('Survival_in_days', '<f8')])

n_repeats = 10  # number of random train/test splits per feature set
ci_by_feature_set = {}
for key, item in dict_features_type_final_comp.items():
    x = df_final.iloc[:, item]
    ci = []
    for i in range(n_repeats):
        # The seed doubles as the repetition index, so split i is the same
        # for every feature set.
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
        gcv.fit(X_train, y_train)  # fit() returns the estimator itself; no rebinding needed
        print(gcv.best_params_)
        # refit=True: prediction goes through the best estimator refitted on X_train.
        ci.append(score_survival_model(gcv, X_test, y_test))
        print(ci)
    ci_by_feature_set[key] = ci

# One column of test-set concordance indices per feature set, one row per split.
df = pd.DataFrame(ci_by_feature_set, columns=list(dict_features_type_final_comp))

df.to_csv("SVM_comp_yanis.csv")

{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6992414384105119]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6992414384105119, 0.6988670912407591]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6992414384105119, 0.6988670912407591, 0.7327586206896551]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6992414384105119, 0.6988670912407591, 0.7327586206896551, 0.7217604381876712]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6992414384105119, 0.6988670912407591, 0.7327586206896551, 0.7217604381876712, 0.7257395172803027]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.6992414384105119, 0.6988670912407591, 0.7327586206896551, 0.7217604381876712, 0.7257395172803027, 0.714052017581546]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.6992414384105119, 0.6988670912407591, 0.7327586206896551, 0.7217604381876712, 0.7257395172803027, 0.714052017581546, 0.7444369410175745]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.6992414384105119, 0

  self.best_estimator_.fit(X, y, **fit_params)


{'alpha': 3.162277660168379e-06, 'optimizer': 'avltree'}
[0.6364141301932904, 0.6608954459628125, 0.6870002498750625, 0.6640240875094092, 0.6608014342153316]
{'alpha': 0.0031622776601683794, 'optimizer': 'avltree'}
[0.6364141301932904, 0.6608954459628125, 0.6870002498750625, 0.6640240875094092, 0.6608014342153316, 0.6656862421097856]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6364141301932904, 0.6608954459628125, 0.6870002498750625, 0.6640240875094092, 0.6608014342153316, 0.6656862421097856, 0.6734801442854084]


  self.best_estimator_.fit(X, y, **fit_params)


{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.6364141301932904, 0.6608954459628125, 0.6870002498750625, 0.6640240875094092, 0.6608014342153316, 0.6656862421097856, 0.6734801442854084, 0.699473389006606]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6364141301932904, 0.6608954459628125, 0.6870002498750625, 0.6640240875094092, 0.6608014342153316, 0.6656862421097856, 0.6734801442854084, 0.699473389006606, 0.664887973058174]
{'alpha': 0.1, 'optimizer': 'avltree'}
[0.6364141301932904, 0.6608954459628125, 0.6870002498750625, 0.6640240875094092, 0.6608014342153316, 0.6656862421097856, 0.6734801442854084, 0.699473389006606, 0.664887973058174, 0.6669202123445052]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6514425365791728]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6514425365791728, 0.6639837424392742]
{'alpha': 3.162277660168379e-06, 'optimizer': 'avltree'}
[0.6514425365791728, 0.6639837424392742, 0.6988380809595203]
{'alpha': 3.1622776601683795

  self.best_estimator_.fit(X, y, **fit_params)


{'alpha': 3.162277660168379e-06, 'optimizer': 'avltree'}
[0.6361126098962135, 0.6611354690050245, 0.6871720389805097, 0.6643283845032752, 0.6603532419242389]
{'alpha': 0.0031622776601683794, 'optimizer': 'avltree'}
[0.6361126098962135, 0.6611354690050245, 0.6871720389805097, 0.6643283845032752, 0.6603532419242389, 0.6651574738094451]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.6361126098962135, 0.6611354690050245, 0.6871720389805097, 0.6643283845032752, 0.6603532419242389, 0.6651574738094451, 0.6741554527037044]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.6361126098962135, 0.6611354690050245, 0.6871720389805097, 0.6643283845032752, 0.6603532419242389, 0.6651574738094451, 0.6741554527037044, 0.6978842079022809]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.6361126098962135, 0.6611354690050245, 0.6871720389805097, 0.6643283845032752, 0.6603532419242389, 0.6651574738094451, 0.6741554527037044, 0.6978842079022809, 0.6674099248027803]
{'alpha': 0.0031622776601683794, 'optimizer': 'avltree'}
[