In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import ShuffleSplit, GridSearchCV,train_test_split

from sksurv.datasets import load_veterans_lung_cancer
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM,FastKernelSurvivalSVM
from sksurv.kernels import clinical_kernel

##### In this notebook, we define the different feature sets that we want to evaluate in our model and store them in a dictionary. After that,
##### we use a grid search on a survival SVM to find a good set of parameters, and we train our model 25 times with the best parameters found
##### using the training and validation sets. Finally, we evaluate each of the 25 trained models on a separate test set.

In [2]:
def score_survival_model(model, X, y):
    """Score a fitted survival model with the censored concordance index.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        Fitted estimator whose predictions are used as risk scores.
    X : features passed unchanged to ``model.predict``.
    y : structured array with the fields ``'Status'`` and
        ``'Survival_in_days'``.

    Returns
    -------
    The first element of the tuple returned by
    ``concordance_index_censored`` (the concordance index itself,
    per the scikit-survival documentation).
    """
    risk_scores = model.predict(X)
    return concordance_index_censored(
        y['Status'], y['Survival_in_days'], risk_scores
    )[0]

In [25]:
# Contiguous groups of column positions in the feature table. Every feature
# set below is a concatenation of these groups, always in ascending column
# order (eln, gen, cyto, comp, clin, demo).
_eln = [0]
gen = list(range(1, 84))
cyto = list(range(84, 153))
comp = list(range(153, 168))
_clin = list(range(168, 175))
_demo = list(range(175, 177))

# Feature sets without the ELN column.
clin_demo = _clin + _demo
cyto_gen = gen + cyto
cyto_comp = cyto + comp
gen_comp = gen + comp
cyto_gen_comp = gen + cyto + comp
clin_demo_cyto = cyto + clin_demo
clin_demo_gen = gen + clin_demo
clin_demo_comp = comp + clin_demo
clin_demo_cyto_gen = gen + cyto + clin_demo
clin_demo_cyto_gen_comp = gen + cyto + comp + clin_demo

# Feature sets that include the ELN column (position 0).
eln_clin = _eln + _clin
eln_clin_demo = _eln + clin_demo
eln_clin_demo_cyto = _eln + cyto + clin_demo
eln_clin_demo_gen = _eln + gen + clin_demo
eln_clin_demo_cyto_gen = _eln + gen + cyto + clin_demo
eln_clin_demo_comp = _eln + comp + clin_demo
eln_cyto_gen = _eln + gen + cyto
eln_cyto_gen_comp = _eln + gen + cyto + comp
eln_cyto_comp = _eln + cyto + comp
eln_gen_comp = _eln + gen + comp
all_features = _eln + gen + cyto + comp + clin_demo

# Name -> column positions, in the order the feature sets are evaluated.
dict_features_type = {
    "all_features": all_features,
    "eln_clin": eln_clin,
    "eln_clin_demo": eln_clin_demo,
    "eln_clin_demo_cyto": eln_clin_demo_cyto,
    "eln_clin_demo_gen": eln_clin_demo_gen,
    "eln_clin_demo_cyto_gen": eln_clin_demo_cyto_gen,
    "eln_clin_demo_comp": eln_clin_demo_comp,
    "eln_cyto_gen": eln_cyto_gen,
    "eln_cyto_gen_comp": eln_cyto_gen_comp,
    "eln_cyto_comp": eln_cyto_comp,
    "eln_gen_comp": eln_gen_comp,
    "clin_demo": clin_demo,
    "clin_demo_cyto": clin_demo_cyto,
    "clin_demo_gen": clin_demo_gen,
    "clin_demo_cyto_gen": clin_demo_cyto_gen,
    "clin_demo_comp": clin_demo_comp,
    "cyto_gen": cyto_gen,
    "cyto_gen_comp": cyto_gen_comp,
    "cyto_comp": cyto_comp,
    "gen_comp": gen_comp,
    "clin_demo_cyto_gen_comp": clin_demo_cyto_gen_comp,
    "gen": gen,
    "cyto": cyto,
    "comp": comp,
}

# Load the tab-separated prognosis feature table; the positional feature
# sets defined above index into its columns.
df_final = pd.read_csv("df_prognosis_features_ready.tsv", sep="\t")

In [None]:
### New column indices for the dataset with the final components. Only the
### feature sets that contain the component columns are redefined here.

eln_clin_demo_comp = [0]+list(range(153,180))
eln_cyto_gen_comp = list(range(171))
eln_cyto_comp = [0] + list(range(84,171))
eln_gen_comp = list(range(84)) + list(range(153,171))
clin_demo_comp = list(range(153,180))
cyto_gen_comp = list(range(1,171))
cyto_comp = list(range(84,171))
gen_comp = list(range(1,84))+list(range(153,171))
clin_demo_cyto_gen_comp = list(range(1,180))
comp = list(range(153,171))
df_final = pd.read_table("df_prognosis_features_ready_final_component.tsv")

# Linear survival SVM tuned over the regularisation strength `alpha`.
estimator = FastSurvivalSVM(max_iter=1000, tol=1e-6, random_state=17)
param_grid = {'alpha': 10. ** np.array([-6,-5.5,-5,-4.5,-2.5,-1,0]),'optimizer':["avltree"]}
cv = ShuffleSplit(n_splits=5,random_state=17)
# `iid` is intentionally not passed: the argument was removed from
# GridSearchCV in scikit-learn 0.24 and raises TypeError there. ShuffleSplit
# yields equally sized validation folds, so the plain mean over folds (the
# old `iid=False` behaviour) is what is computed either way.
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=4, refit=True,
                   cv=cv)

# Feature-set name -> column positions in `df_final`.
dict_features_type_final_comp = {
    "eln_clin_demo_comp_final_comp": eln_clin_demo_comp,
    "eln_cyto_gen_comp_final_comp": eln_cyto_gen_comp,
    "eln_cyto_comp_final_comp": eln_cyto_comp,
    "eln_gen_comp_final_comp": eln_gen_comp,
    "clin_demo_comp_final_comp": clin_demo_comp,
    "cyto_gen_comp_final_comp": cyto_gen_comp,
    "cyto_comp_final_comp": cyto_comp,
    "gen_comp_final_comp": gen_comp,
    "clin_demo_cyto_gen_comp_final_comp": clin_demo_cyto_gen_comp,
    "comp_final_comp": comp,
}

# The survival target does not depend on the feature set, so it is built
# once: a structured array of (event indicator, survival time).
y = np.array(list(zip(df_final.os_status, df_final.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])

ci_per_feature_set = {}
for key,item in dict_features_type_final_comp.items():
    x = df_final.iloc[:,item]
    ci=[]
    # 25 random 80/20 splits; the grid search (with refit) only sees the
    # training part, the held-out part is used once for the final score.
    for i in range(25):
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
        gcv = gcv.fit(X_train,y_train)
        print(gcv.best_params_)
        # Same metric as the one used for model selection.
        ci.append(score_survival_model(gcv, X_test, y_test))
        print(ci)
    ci_per_feature_set[key] = ci

# One column of concordance indices per feature set, one row per split.
df = pd.DataFrame(ci_per_feature_set)

# NOTE(review): the next experiment cell writes to the same "SVM_some.csv"
# path, so running both cells overwrites these results — confirm the
# intended output file names.
df.to_csv("SVM_some.csv")

In [None]:
# Column indices for the dataset without the component columns
# ("prognosis_without_comp.tsv"). These assignments replace the lists of the
# same name defined in earlier cells.
eln_clin_gen = list(range(84))+list(range(153,160))  
eln_demo_gen = list(range(84))+[160,161] 
eln_clin_demo_cyto_gen =list(range(162)) 
eln_clin_demo_cyto = [0]+list(range(84,162))
eln_clin_demo_gen = list(range(84))+list(range(153,162))
eln_clin_demo = [0] + list(range(153,162))
eln_clin = [0] + list(range(153,160))
eln_cyto_gen = list(range(153))
clin_demo_cyto_gen = list(range(1,162))
clin_demo_cyto = list(range(84,162))
df_final = pd.read_table("prognosis_without_comp.tsv")

# Linear survival SVM tuned over the regularisation strength `alpha`.
estimator = FastSurvivalSVM(max_iter=1000, tol=1e-6, random_state=17)
param_grid = {'alpha': 10. ** np.array([-6,-5.5,-5,-4.5,-2.5,-1,0]),'optimizer':["avltree"]}
cv = ShuffleSplit(n_splits=5,random_state=17)
# `iid` is intentionally not passed: the argument was removed from
# GridSearchCV in scikit-learn 0.24 and raises TypeError there. ShuffleSplit
# yields equally sized validation folds, so the plain mean over folds (the
# old `iid=False` behaviour) is what is computed either way.
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=4, refit=True,
                   cv=cv)

# Feature-set name -> column positions in `df_final`. The variable keeps its
# historical name although none of these sets contains component columns.
dict_features_type_final_comp = {
    "eln_clin_gen": eln_clin_gen,
    "eln_demo_gen": eln_demo_gen,
    "eln_clin_demo_cyto_gen": eln_clin_demo_cyto_gen,
    "eln_clin_demo_cyto": eln_clin_demo_cyto,
    "eln_clin_demo_gen": eln_clin_demo_gen,
    "eln_clin_demo": eln_clin_demo,
    "eln_clin": eln_clin,
    "eln_cyto_gen": eln_cyto_gen,
    "clin_demo_cyto_gen": clin_demo_cyto_gen,
    "clin_demo_cyto": clin_demo_cyto,
}

# The survival target does not depend on the feature set, so it is built
# once: a structured array of (event indicator, survival time).
y = np.array(list(zip(df_final.os_status, df_final.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])

ci_per_feature_set = {}
for key,item in dict_features_type_final_comp.items():
    x = df_final.iloc[:,item]
    ci=[]
    # NOTE(review): only 10 random splits here, whereas the notebook
    # introduction and the previous experiment use 25 — confirm which
    # repetition count is intended.
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
        gcv = gcv.fit(X_train,y_train)
        print(gcv.best_params_)
        # Same metric as the one used for model selection.
        ci.append(score_survival_model(gcv, X_test, y_test))
        print(ci)
    ci_per_feature_set[key] = ci

# One column of concordance indices per feature set, one row per split.
df = pd.DataFrame(ci_per_feature_set)

# NOTE(review): the previous experiment cell also writes "SVM_some.csv", so
# this call overwrites its results — confirm the intended output file names.
df.to_csv("SVM_some.csv")

{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.6542196972101437]


In [45]:
# Ad-hoc 80/20 split for interactive inspection. `x` and `y` are whatever the
# previously executed experiment cell left behind (its last feature set), so
# this cell only works after that cell has run.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, random_state=1, test_size=0.2
)

In [52]:
# Preview the tab-separated table used by the without-component experiment.
# NOTE(review): the rendered output lists an "Unnamed: 0" column holding
# sample IDs, while the positional feature lists in this notebook appear to
# treat column 0 as the ELN feature — confirm whether the file should be
# read with index_col=0.
pd.read_csv("prognosis_without_comp.tsv", sep="\t")

Unnamed: 0,eln_2017_ratio,ASXL1,ASXL2,ASXL3,ATRX,BAGE3,BCOR,BRAF,CBFB,CBL,...,perf_status,bm_blasts,secondary,wbc,hb,plt,gender,age,os,os_status
PD14868a,3,0,0,0,0,0,0,0,0,0,...,1,89.000000,1,175.0,9.899994,199.0,0,79.712526,0.191650,1
PD14869c,2,0,0,0,0,0,0,0,0,0,...,1,90.000000,1,3.1,8.699997,23.0,1,65.043121,4.706366,0
PD14871a,3,0,0,0,0,0,0,0,0,0,...,0,62.000000,1,127.0,7.599998,57.0,1,79.657769,0.377823,1
PD14872a,1,0,0,0,0,0,1,0,0,0,...,2,40.000000,1,8.2,7.699997,124.0,0,66.505133,0.049281,1
PD14873a,1,1,0,0,0,0,0,0,0,0,...,1,25.000000,2,44.2,9.299995,40.0,1,71.780972,0.550308,1
PD14874a,1,0,0,0,0,0,0,0,0,0,...,0,56.000000,1,69.0,9.099998,63.0,1,75.014374,0.208077,1
PD14875a,3,0,0,0,0,0,0,0,0,0,...,2,70.000000,1,43.4,10.299995,64.0,0,74.795346,1.587953,1
PD14876c,1,0,0,0,0,0,0,0,0,0,...,1,90.000000,2,59.1,7.599998,104.0,1,65.659138,0.243669,1
PD14877c,3,0,0,0,0,0,0,0,0,0,...,1,92.000000,1,144.0,6.299999,47.0,0,84.104038,0.065708,1
PD14879a,1,0,0,0,0,0,0,0,0,0,...,0,84.000000,1,4.6,10.599998,25.0,1,76.501027,0.112252,1
