In [2]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import ShuffleSplit, GridSearchCV,train_test_split
from sklearn.model_selection import cross_validate

from sksurv.datasets import load_veterans_lung_cancer
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM,FastKernelSurvivalSVM
from sksurv.kernels import clinical_kernel
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

##### In this notebook, we define the different feature sets that we want to feed to our evaluation model and collect them in a dictionary. After that,
##### we use a grid search on a survival SVM to find a good set of parameters, and we train our model 25 times with the best parameters found
##### using the training and validation sets. Finally, we evaluate our model 25 times on a separate test set.

In [3]:
def score_survival_model(model, X, y):
    """Scoring callable with the ``(estimator, X, y)`` signature.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature matrix passed straight to ``model.predict``.
    y : structured array (or mapping) with the fields ``'Status'`` and
        ``'Survival_in_days'``.

    Returns
    -------
    The first element of the tuple returned by
    ``concordance_index_censored`` (the concordance index itself).
    """
    predicted = model.predict(X)
    event_indicator = y['Status']
    event_time = y['Survival_in_days']
    scores = concordance_index_censored(event_indicator, event_time, predicted)
    return scores[0]

In [4]:
# Column-index groups of the prognosis table. Every list below is a
# concatenation of these groups; the group names follow the prefixes used in
# the list names (eln, gen, cyto, clin, demo, age, comp).
_ELN = (0,)
_GEN = tuple(range(1, 85))
_CYTO = tuple(range(85, 154))
_CLIN = tuple(range(154, 161))
_DEMO_NO_AGE = (161,)
_AGE = (162,)
_DEMO = _DEMO_NO_AGE + _AGE
_COMP = tuple(range(163, 180))


def _concat(*groups):
    """Return a fresh list with the indices of ``groups`` in the given order."""
    return [index for group in groups for index in group]


# --- single groups -----------------------------------------------------------
eln = _concat(_ELN)
gen = _concat(_GEN)
cyto = _concat(_CYTO)
comp = _concat(_COMP)

# --- combinations that include the components (comp) -------------------------
all_features = _concat(_ELN, _GEN, _CYTO, _CLIN, _DEMO, _COMP)
clin_demo_comp = _concat(_CLIN, _DEMO, _COMP)
clin_demo_cyto_gen_comp = _concat(_GEN, _CYTO, _CLIN, _DEMO, _COMP)
cyto_comp = _concat(_CYTO, _COMP)
cyto_gen_comp = _concat(_GEN, _CYTO, _COMP)
eln_clin_demo_comp = _concat(_ELN, _CLIN, _DEMO, _COMP)
eln_cyto_comp = _concat(_ELN, _CYTO, _COMP)
eln_cyto_gen_comp = _concat(_ELN, _GEN, _CYTO, _COMP)
eln_gen_comp = _concat(_ELN, _GEN, _COMP)
gen_comp = _concat(_GEN, _COMP)
clin_comp = _concat(_CLIN, _COMP)
clin_cyto_comp = _concat(_CYTO, _CLIN, _COMP)
clin_gen_comp = _concat(_GEN, _CLIN, _COMP)
eln_clin_comp = _concat(_ELN, _CLIN, _COMP)
eln_comp = _concat(_ELN, _COMP)

# Without age (column 162 left out)
all_features_without_age = _concat(_ELN, _GEN, _CYTO, _CLIN, _DEMO_NO_AGE, _COMP)
clin_demo_comp_without_age = _concat(_CLIN, _DEMO_NO_AGE, _COMP)
clin_demo_cyto_gen_comp_without_age = _concat(_GEN, _CYTO, _CLIN, _DEMO_NO_AGE, _COMP)
eln_clin_demo_comp_without_age = _concat(_ELN, _CLIN, _DEMO_NO_AGE, _COMP)

# --- combinations without the components -------------------------------------
eln_clin_gen = _concat(_ELN, _GEN, _CLIN)
eln_demo_gen = _concat(_ELN, _GEN, _DEMO)
eln_clin_demo_cyto_gen = _concat(_ELN, _GEN, _CYTO, _CLIN, _DEMO)
eln_clin_demo_cyto = _concat(_ELN, _CYTO, _CLIN, _DEMO)
eln_clin_demo_gen = _concat(_ELN, _GEN, _CLIN, _DEMO)
eln_clin_demo = _concat(_ELN, _CLIN, _DEMO)
eln_clin = _concat(_ELN, _CLIN)
eln_cyto_gen = _concat(_ELN, _GEN, _CYTO)
eln_gen = _concat(_ELN, _GEN)
eln_cyto = _concat(_ELN, _CYTO)
eln_age = _concat(_ELN, _AGE)
clin_demo_cyto_gen = _concat(_GEN, _CYTO, _CLIN, _DEMO)
clin_demo_cyto = _concat(_CYTO, _CLIN, _DEMO)
clin_demo_gen = _concat(_GEN, _CLIN, _DEMO)
clin_demo = _concat(_CLIN, _DEMO)
cyto_gen = _concat(_GEN, _CYTO)
clin_gen = _concat(_GEN, _CLIN)
clin_cyto = _concat(_CYTO, _CLIN)
demo_gen = _concat(_GEN, _DEMO)
demo_cyto = _concat(_CYTO, _DEMO)
gen_age = _concat(_GEN, _AGE)

### Without age:
eln_demo_gen_without_age = _concat(_ELN, _GEN, _DEMO_NO_AGE)
eln_clin_demo_cyto_gen_without_age = _concat(_ELN, _GEN, _CYTO, _CLIN, _DEMO_NO_AGE)
eln_clin_demo_cyto_without_age = _concat(_ELN, _CYTO, _CLIN, _DEMO_NO_AGE)
eln_clin_demo_gen_without_age = _concat(_ELN, _GEN, _CLIN, _DEMO_NO_AGE)
eln_clin_demo_without_age = _concat(_ELN, _CLIN, _DEMO_NO_AGE)
clin_demo_cyto_gen_without_age = _concat(_GEN, _CYTO, _CLIN, _DEMO_NO_AGE)
clin_demo_cyto_without_age = _concat(_CYTO, _CLIN, _DEMO_NO_AGE)
clin_demo_gen_without_age = _concat(_GEN, _CLIN, _DEMO_NO_AGE)
clin_demo_without_age = _concat(_CLIN, _DEMO_NO_AGE)
demo_gen_without_age = _concat(_GEN, _DEMO_NO_AGE)
demo_cyto_without_age = _concat(_CYTO, _DEMO_NO_AGE)


In [6]:



# Load the prognosis table and configure the hyper-parameter search.
df_final = pd.read_table("prognosis_comp_final.tsv")
estimator = FastSurvivalSVM(max_iter=1000, tol=1e-6, random_state=17)
# Candidate regularisation strengths (powers of ten) and the optimizer to use.
param_grid = {'alpha': 10. ** np.array([-6,-5.5,-5,-4.5,-2.5,-1,0]),'optimizer':["avltree"]}
cv = ShuffleSplit(n_splits=5,random_state=17)
# `iid` is deliberately not passed: the keyword was removed from GridSearchCV
# in scikit-learn 0.24 and passing it raises a TypeError there. ShuffleSplit
# produces test folds of equal size, so the fold-size-weighted and the plain
# mean score coincide and the selected parameters are unaffected.
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=50, refit=True,
                   cv=cv)

# Only the ELN add-on feature sets are evaluated in this cell; the complete
# list of feature-set combinations is assembled in the bootstrapping cell
# further down. A dict literal keeps every name next to its index list so the
# two cannot get out of step.
dict_features_type_final_comp = {
    "eln": eln,
    "eln_comp": eln_comp,
    "eln_age": eln_age,
    "eln_gen": eln_gen,
    "eln_cyto": eln_cyto,
}
df=pd.DataFrame(columns=dict_features_type_final_comp.keys())

# Structured survival target (event flag, time); identical for every feature
# set, so it is built once instead of once per loop iteration.
y = np.array(list(zip(df_final.os_status, df_final.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])

for key,item in dict_features_type_final_comp.items():
    # Select the columns of this feature set by position.
    x = df_final.iloc[:,item]
    ci=[]
    # 25 different random 80/20 train/test splits (seeded by the loop index).
    for i in range(25):
        X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(x), y, test_size=0.2, random_state=i)
        # Grid search on the training part; refit=True retrains the best
        # configuration on the whole training part before predicting.
        gcv = gcv.fit(X_train,y_train)
        print(gcv.best_params_)
        ci.append(concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], gcv.predict(X_test))[0])
        print(ci)
    # One column of 25 test-set concordance indices per feature set.
    df[key] = ci
    
#df.to_csv("../ELN_Comparison/SVM_add_eln_comp.csv")

{'alpha': 1e-06, 'optimizer': 'avltree'}
[0.5957247595772368]
{'alpha': 1e-06, 'optimizer': 'avltree'}
[0.5957247595772368, 0.6141069542676096]
{'alpha': 1e-06, 'optimizer': 'avltree'}
[0.5957247595772368, 0.6141069542676096, 0.6381028235882059]
{'alpha': 1e-06, 'optimizer': 'avltree'}
[0.5957247595772368, 0.6141069542676096, 0.6381028235882059, 0.6104357853264787]
{'alpha': 1e-06, 'optimizer': 'avltree'}
[0.5957247595772368, 0.6141069542676096, 0.6381028235882059, 0.6104357853264787, 0.5918877195312241]
{'alpha': 1e-06, 'optimizer': 'avltree'}
[0.5957247595772368, 0.6141069542676096, 0.6381028235882059, 0.6104357853264787, 0.5918877195312241, 0.6102481906209722]
{'alpha': 1e-06, 'optimizer': 'avltree'}
[0.5957247595772368, 0.6141069542676096, 0.6381028235882059, 0.6104357853264787, 0.5918877195312241, 0.6102481906209722, 0.6207072620361372]
{'alpha': 1e-06, 'optimizer': 'avltree'}
[0.5957247595772368, 0.6141069542676096, 0.6381028235882059, 0.6104357853264787, 0.5918877195312241, 0.61

  self.best_estimator_.fit(X, y, **fit_params)


{'alpha': 3.162277660168379e-06, 'optimizer': 'avltree'}
[0.6943377662106833, 0.6901302525042404, 0.7056862193903048, 0.7084434407982191]
{'alpha': 1e-06, 'optimizer': 'avltree'}
[0.6943377662106833, 0.6901302525042404, 0.7056862193903048, 0.7084434407982191, 0.7006905481225723]
{'alpha': 1e-06, 'optimizer': 'avltree'}
[0.6943377662106833, 0.6901302525042404, 0.7056862193903048, 0.7084434407982191, 0.7006905481225723, 0.6913810767044516]
{'alpha': 1e-06, 'optimizer': 'avltree'}
[0.6943377662106833, 0.6901302525042404, 0.7056862193903048, 0.7084434407982191, 0.7006905481225723, 0.6913810767044516, 0.7288636700541894]
{'alpha': 1e-06, 'optimizer': 'avltree'}
[0.6943377662106833, 0.6901302525042404, 0.7056862193903048, 0.7084434407982191, 0.7006905481225723, 0.6913810767044516, 0.7288636700541894, 0.7188941169138726]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.6943377662106833, 0.6901302525042404, 0.7056862193903048, 0.7084434407982191, 0.7006905481225723, 0.6913810767044516, 0.7288636700

  self.best_estimator_.fit(X, y, **fit_params)


{'alpha': 3.162277660168379e-06, 'optimizer': 'avltree'}
[0.6943377662106833, 0.6901302525042404, 0.7056862193903048, 0.7084434407982191, 0.7006905481225723, 0.6913810767044516, 0.7288636700541894, 0.7188941169138726, 0.6962585923203494, 0.684620869978607, 0.7165516240363836]
{'alpha': 1e-06, 'optimizer': 'avltree'}
[0.6943377662106833, 0.6901302525042404, 0.7056862193903048, 0.7084434407982191, 0.7006905481225723, 0.6913810767044516, 0.7288636700541894, 0.7188941169138726, 0.6962585923203494, 0.684620869978607, 0.7165516240363836, 0.6856973434535104]
{'alpha': 3.162277660168379e-06, 'optimizer': 'avltree'}
[0.6943377662106833, 0.6901302525042404, 0.7056862193903048, 0.7084434407982191, 0.7006905481225723, 0.6913810767044516, 0.7288636700541894, 0.7188941169138726, 0.6962585923203494, 0.684620869978607, 0.7165516240363836, 0.6856973434535104, 0.6907313948219559]
{'alpha': 1e-06, 'optimizer': 'avltree'}
[0.6943377662106833, 0.6901302525042404, 0.7056862193903048, 0.7084434407982191, 0.7

  self.best_estimator_.fit(X, y, **fit_params)


{'alpha': 3.162277660168379e-06, 'optimizer': 'avltree'}
[0.6943377662106833, 0.6901302525042404, 0.7056862193903048, 0.7084434407982191, 0.7006905481225723, 0.6913810767044516, 0.7288636700541894, 0.7188941169138726, 0.6962585923203494, 0.684620869978607, 0.7165516240363836, 0.6856973434535104, 0.6907313948219559, 0.7097093535862891, 0.7124733053863798, 0.6848891140202301, 0.6790073546561348, 0.6968772577667175, 0.6727072774218227, 0.6821847389558233, 0.6799996809647626, 0.6984744862388367]
{'alpha': 3.162277660168379e-06, 'optimizer': 'avltree'}
[0.6943377662106833, 0.6901302525042404, 0.7056862193903048, 0.7084434407982191, 0.7006905481225723, 0.6913810767044516, 0.7288636700541894, 0.7188941169138726, 0.6962585923203494, 0.684620869978607, 0.7165516240363836, 0.6856973434535104, 0.6907313948219559, 0.7097093535862891, 0.7124733053863798, 0.6848891140202301, 0.6790073546561348, 0.6968772577667175, 0.6727072774218227, 0.6821847389558233, 0.6799996809647626, 0.6984744862388367, 0.6864

  self.best_estimator_.fit(X, y, **fit_params)


{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6086663281175612, 0.6294684289691811, 0.6642069590204898]
{'alpha': 0.0031622776601683794, 'optimizer': 'avltree'}
[0.6086663281175612, 0.6294684289691811, 0.6642069590204898, 0.6366533736927241]
{'alpha': 0.0031622776601683794, 'optimizer': 'avltree'}
[0.6086663281175612, 0.6294684289691811, 0.6642069590204898, 0.6366533736927241, 0.6071926562863119]
{'alpha': 0.0031622776601683794, 'optimizer': 'avltree'}
[0.6086663281175612, 0.6294684289691811, 0.6642069590204898, 0.6366533736927241, 0.6071926562863119, 0.6334644238077928]


  self.best_estimator_.fit(X, y, **fit_params)


{'alpha': 3.162277660168379e-06, 'optimizer': 'avltree'}
[0.6086663281175612, 0.6294684289691811, 0.6642069590204898, 0.6366533736927241, 0.6071926562863119, 0.6334644238077928, 0.6403159125722663]
{'alpha': 0.1, 'optimizer': 'avltree'}
[0.6086663281175612, 0.6294684289691811, 0.6642069590204898, 0.6366533736927241, 0.6071926562863119, 0.6334644238077928, 0.6403159125722663, 0.6162906643400224]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6086663281175612, 0.6294684289691811, 0.6642069590204898, 0.6366533736927241, 0.6071926562863119, 0.6334644238077928, 0.6403159125722663, 0.6162906643400224, 0.6291270048747482]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6086663281175612, 0.6294684289691811, 0.6642069590204898, 0.6366533736927241, 0.6071926562863119, 0.6334644238077928, 0.6403159125722663, 0.6162906643400224, 0.6291270048747482, 0.6263291339830441]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.6086663281175612, 0.6294684289691811, 0.6642069590204898, 0.

In [7]:
# NOT NOW
# Add one column holding the sum of the chr_splicing, NC1, NC2, NC3 and NC6
# component columns.
df_final["NC_chr_splicing"] = df_final["final_component_chr_splicing"]+df_final["final_component_NC1"]+df_final["final_component_NC2"]+df_final["final_component_NC3"]+df_final["final_component_NC6"]
# NOTE(review): 183 is presumably the position of the "NC_chr_splicing" column
# created above (i.e. df_final had 183 columns before) - confirm; the index is
# positional and silently selects a different column if the table changes.
eln_comp_chr_merged = [0,163,164,165,166,167,168,169,170,171,176,178,179,183] 
estimator = FastSurvivalSVM(max_iter=1000, tol=1e-6, random_state=17)
param_grid = {'alpha': 10. ** np.array([-6,-5.5,-5,-4.5,-2.5,-1,0]),'optimizer':["avltree"]}
cv = ShuffleSplit(n_splits=5,random_state=17)
# `iid` is deliberately not passed: the keyword was removed from GridSearchCV
# in scikit-learn 0.24 and passing it raises a TypeError there. ShuffleSplit
# produces test folds of equal size, so the fold-size-weighted and the plain
# mean score coincide and the selected parameters are unaffected.
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=50, refit=True,
                   cv=cv)

# Feature matrix (selected by position) and structured survival target.
x = df_final.iloc[:,eln_comp_chr_merged]
y = np.array(list(zip(df_final.os_status, df_final.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
ci=[]
# 25 different random 80/20 train/test splits (seeded by the loop index).
for i in range(25):
    X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(x), y, test_size=0.2, random_state=i)
    gcv = gcv.fit(X_train,y_train)
    print(gcv.best_params_)
    ci.append(concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], gcv.predict(X_test))[0])
    print(ci)
# `df` is the results frame created by the previous evaluation cell; the new
# scores are added as one more column before everything is written out.
df["eln_comp_chr_merged"] = ci
df.to_csv("../ELN_Comparison/SVM_add_eln_comp.csv")

{'alpha': 3.162277660168379e-06, 'optimizer': 'avltree'}
[0.6175135684133685]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6175135684133685, 0.6306045380356514]
{'alpha': 3.162277660168379e-06, 'optimizer': 'avltree'}
[0.6175135684133685, 0.6306045380356514, 0.6691576086956522]
{'alpha': 3.162277660168379e-06, 'optimizer': 'avltree'}
[0.6175135684133685, 0.6306045380356514, 0.6691576086956522, 0.6341709508480277]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.6175135684133685, 0.6306045380356514, 0.6691576086956522, 0.6341709508480277, 0.6252946449321072]
{'alpha': 0.1, 'optimizer': 'avltree'}
[0.6175135684133685, 0.6306045380356514, 0.6691576086956522, 0.6341709508480277, 0.6252946449321072, 0.6329356555074523]
{'alpha': 3.162277660168379e-06, 'optimizer': 'avltree'}
[0.6175135684133685, 0.6306045380356514, 0.6691576086956522, 0.6341709508480277, 0.6252946449321072, 0.6329356555074523, 0.6472172351885098]
{'alpha': 3.162277660168379e-06, 'optimizer': 'avltree'}
[0.6175135

In [None]:
# BOOTSTRAPPING+FEATURES_REDUCED
#
# For every selected feature set this cell
#   1. looks for a GLM bootstrap summary of that feature set; if one exists,
#      it re-runs the grid search on a reduced set of columns and records 10
#      test-set concordance indices;
#   2. refits a FastSurvivalSVM on 100 bootstrap resamples of the data and
#      writes the per-feature sum of the coefficients to a CSV file.
# It relies on `gcv`, the GridSearchCV object created in an earlier cell.

# Every feature-set combination defined above, kept for full runs. Written as
# a dict literal so each name sits next to its own index list.
all_feature_sets = {
    "all_features": all_features,
    "clin_demo_comp": clin_demo_comp,
    "clin_demo_cyto_gen_comp": clin_demo_cyto_gen_comp,
    "comp": comp,
    "cyto_comp": cyto_comp,
    "cyto_gen_comp": cyto_gen_comp,
    "eln_clin_demo_comp": eln_clin_demo_comp,
    "eln_cyto_comp": eln_cyto_comp,
    "eln_cyto_gen_comp": eln_cyto_gen_comp,
    "eln_gen_comp": eln_gen_comp,
    "gen_comp": gen_comp,
    "clin_comp": clin_comp,
    "clin_cyto_comp": clin_cyto_comp,
    "clin_gen_comp": clin_gen_comp,
    "eln_clin_comp": eln_clin_comp,
    "all_features_without_age": all_features_without_age,
    "clin_demo_comp_without_age": clin_demo_comp_without_age,
    "clin_demo_cyto_gen_comp_without_age": clin_demo_cyto_gen_comp_without_age,
    "eln_clin_demo_comp_without_age": eln_clin_demo_comp_without_age,
    "eln_clin_gen": eln_clin_gen,
    "eln_demo_gen": eln_demo_gen,
    "eln_clin_demo_cyto_gen": eln_clin_demo_cyto_gen,
    "eln_clin_demo_cyto": eln_clin_demo_cyto,
    "eln_clin_demo_gen": eln_clin_demo_gen,
    "eln_clin_demo": eln_clin_demo,
    "eln_clin": eln_clin,
    "eln_cyto_gen": eln_cyto_gen,
    "clin_demo_cyto_gen": clin_demo_cyto_gen,
    "clin_demo_cyto": clin_demo_cyto,
    "clin_demo_gen": clin_demo_gen,
    "clin_demo": clin_demo,
    "cyto_gen": cyto_gen,
    "cyto": cyto,
    "gen": gen,
    "clin_gen": clin_gen,
    "clin_cyto": clin_cyto,
    "demo_gen": demo_gen,
    "demo_cyto": demo_cyto,
    "eln_demo_gen_without_age": eln_demo_gen_without_age,
    "eln_clin_demo_cyto_gen_without_age": eln_clin_demo_cyto_gen_without_age,
    "eln_clin_demo_cyto_without_age": eln_clin_demo_cyto_without_age,
    "eln_clin_demo_gen_without_age": eln_clin_demo_gen_without_age,
    "eln_clin_demo_without_age": eln_clin_demo_without_age,
    "clin_demo_cyto_gen_without_age": clin_demo_cyto_gen_without_age,
    "clin_demo_cyto_without_age": clin_demo_cyto_without_age,
    "clin_demo_gen_without_age": clin_demo_gen_without_age,
    "clin_demo_without_age": clin_demo_without_age,
    "demo_gen_without_age": demo_gen_without_age,
    "demo_cyto_without_age": demo_cyto_without_age,
    "gen_age": gen_age,
}

# Feature sets actually processed by this cell. The previous zip() paired four
# names with five lists, which shifted every name onto the wrong list
# ("eln_comp" -> eln, "eln_age" -> eln_comp, ...) and dropped eln_cyto; each
# name is now bound to the list of the same name.
dict_features_type_final_comp = {
    "eln_comp": eln_comp,
    "eln_age": eln_age,
    "eln_gen": eln_gen,
    "eln_cyto": eln_cyto,
}
df_final = pd.read_table("prognosis_comp_final.tsv")
df = pd.DataFrame(columns=dict_features_type_final_comp.keys())
for key,item in dict_features_type_final_comp.items():

    print(key)

    ##############    FEATURES REDUCED DF   ##################
    # The GLM bootstrap summary is optional: only a missing file is tolerated.
    # Any other failure is allowed to surface instead of being swallowed.
    try:
        tmp = pd.read_table("comparison_dataframes/" + key + "_bootstrap_glm_0.7.tsv")
    except FileNotFoundError:
        print("no GLM bootstrap file for " + key + ": reduced-feature evaluation skipped")
    else:
        print('in')
        tmp1 = tmp[tmp.coef>0]
        tmp2 = tmp[tmp.coef<0]
        # Keep the features with the largest positive coefficients (above the
        # 0.9 quantile), the most negative ones (below the 0.15 quantile) and
        # those with the highest `n` (above the 0.85 quantile).
        # NOTE(review): a feature matching more than one criterion appears
        # more than once in `cols` and is then duplicated in `x_reduced` -
        # confirm whether that is intended.
        cols = tmp1[tmp1.coef>tmp1.coef.quantile(0.9)].feature.tolist()+tmp2[tmp2.coef<tmp2.coef.quantile(0.15)].feature.tolist()+tmp[tmp.n>tmp.n.quantile(0.85)].feature.tolist()
        x_reduced = df_final[cols]
        response = np.array(list(zip(df_final.os_status, df_final.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
        ci=[]
        # 10 different random 80/20 train/test splits (seeded by the loop index).
        for i in range(10):
            X_train, X_test, y_train, y_test = train_test_split(x_reduced, response, test_size=0.2, random_state=i)
            gcv = gcv.fit(X_train,y_train)
            print(gcv.best_params_)
            ci.append(concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], gcv.predict(X_test))[0])
            print(ci)
        df[key] = ci

    ##############    BOOTSTRAPPING DF   ##################
    # Hyper-parameters come from the most recent grid-search fit. When the
    # reduced-feature step above was skipped, that is the fit of a previous
    # feature set or of an earlier cell.
    best_params = gcv.best_params_
    bootstrap_frames = []
    for i in range(100):
        # Resample 80% of the rows with replacement, seeded by the loop index.
        data = df_final.sample(frac=0.8,replace=True,random_state=i)
        x = data.iloc[:,item]
        y = np.array(list(zip(data.os_status, data.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
        est = FastSurvivalSVM(alpha=best_params["alpha"], fit_intercept=False, max_iter=1000, optimizer=best_params["optimizer"],
                random_state=17, rank_ratio=1.0, timeit=False, tol=1e-06,
                verbose=False).fit(x,y)
        bootstrap_frames.append(pd.DataFrame({'feature': x.columns, 'coef': est.coef_}))

    # Concatenate once instead of DataFrame.append in the loop (removed in
    # pandas 2.0), then sum the coefficients per feature.
    df_svm = pd.concat(bootstrap_frames, ignore_index=True)
    df_svm = df_svm.groupby(['feature']).sum()
    df_svm["feature"] = df_svm.index
    df_svm["n"] = 100
    # `del df_svm.index.name` raises on pandas >= 1.0; assigning None clears
    # the index name on every version.
    df_svm.index.name = None

    df_svm.to_csv("comparison_dataframes/" + key + "_bootstrap_SVM.csv")


df.to_csv("comparison_dataframes/SVM_comp_reduced_add.csv")