In [216]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import ShuffleSplit, GridSearchCV,train_test_split

from sksurv.datasets import load_veterans_lung_cancer
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM,FastKernelSurvivalSVM
from sksurv.kernels import clinical_kernel

In this notebook, we create the different features that we want to put in our evaluation model with a dictionary. After that,
we use a gridsearch on SVM survival to find a good set of parameters and we train our model 25 times with the best parameters find using the training and validation set. Finally, we evaluate 25 times our 
model on a separate testing set.

In [217]:
def score_survival_model(model, X, y):
    prediction = model.predict(X)
    result = concordance_index_censored(y['Status'], y['Survival_in_days'], prediction)
    return result[0]

In [218]:
all_features = list(range(177))
eln_clin = [0]+list(range(168,175))
eln_clin_demo = [0]+list(range(168,177))
eln_clin_demo_cyto = [0]+list(range(84,153))+list(range(168,177))
eln_clin_demo_gen = list(range(84))+list(range(168,177))
eln_clin_demo_cyto_gen = list(range(153))+list(range(168,177))
eln_clin_demo_comp = [0]+list(range(153,177))
eln_cyto_gen = list(range(153))
eln_cyto_gen_comp = list(range(168))
eln_cyto_comp = [0] + list(range(84,168))
eln_gen_comp = list(range(84)) + list(range(153,168))

clin_demo = list(range(168,177))
clin_demo_cyto = list(range(84,153))+list(range(168,177))
clin_demo_gen = list(range(1,84))+list(range(168,177))
clin_demo_cyto_gen = list(range(1,153))+ list(range(168,177))
clin_demo_comp = list(range(153,177))
cyto_gen = list(range(1,153))
cyto_gen_comp = list(range(1,168))
cyto_comp = list(range(84,168))
gen_comp = list(range(1,84))+list(range(153,168))
clin_demo_cyto_gen_comp = list(range(1,177))
gen = list(range(1,84))
cyto = list(range(84,153))
comp = list(range(153,168))

dict_features_type = dict(zip(("all_features","eln_clin","eln_clin_demo",
         "eln_clin_demo_cyto","eln_clin_demo_gen","eln_clin_demo_cyto_gen",
         "eln_clin_demo_comp","eln_cyto_gen","eln_cyto_gen_comp",
         "eln_cyto_comp","eln_gen_comp","clin_demo",
         "clin_demo_cyto","clin_demo_gen","clin_demo_cyto_gen",
         "clin_demo_comp","cyto_gen","cyto_gen_comp",
         "cyto_comp","gen_comp","clin_demo_cyto_gen_comp",
         "gen","cyto","comp"), (all_features,eln_clin,eln_clin_demo,
         eln_clin_demo_cyto,eln_clin_demo_gen,eln_clin_demo_cyto_gen,
         eln_clin_demo_comp,eln_cyto_gen,eln_cyto_gen_comp,
         eln_cyto_comp,eln_gen_comp,clin_demo,
         clin_demo_cyto,clin_demo_gen,clin_demo_cyto_gen,
         clin_demo_comp,cyto_gen,cyto_gen_comp,
         cyto_comp,gen_comp,clin_demo_cyto_gen_comp,
         gen,cyto,comp)))

df_final = pd.read_table("df_prognosis_features_ready.tsv")

In [219]:
estimator = FastSurvivalSVM(max_iter=1000, tol=1e-6, random_state=17)

In [220]:
param_grid = {'alpha': 10. ** np.array([-6,-5.5,-5,-4.5,-2.5,-1,0]),'optimizer':["avltree"]}
cv = ShuffleSplit(n_splits=5,random_state=17)
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=4, iid=False, refit=True,
                   cv=cv)
#refit=True to have the best params for the prediction of next step

In [None]:
df=pd.DataFrame(columns=dict_features_type.keys())

for key,item in dict_features_type.items():
    x = df_final.iloc[:,item]
    y = np.array(list(zip(df_final.os_status, df_final.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
    ci=[]
    for i in range(25):
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
        gcv = gcv.fit(X_train,y_train)
        print(gcv.best_params_)
        ci.append(concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], gcv.predict(X_test))[0])
        print(ci)
    df[key] = ci


In [None]:
df.to_csv("SVM_different_features_type.csv")

In [None]:
param_grid = {'alpha': 10. ** np.array([-6,-5.5,-5,-4.5,-2.5,-1,0]),'optimizer':["avltree"]}

cv = ShuffleSplit(n_splits=5,random_state=17)
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=4, iid=False, refit=True,
                   cv=cv)
cv = ShuffleSplit(n_splits=5,random_state=17)
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=4, iid=False, refit=True,
                   cv=cv)
df1=pd.DataFrame(columns=dict_features_type.keys())

for key,item in dict_features_type.items():
    x = df_final.iloc[:,item]
    y = np.array(list(zip(df_final.os_status, df_final.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
    ci=[]
    for i in range(25):
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
        gcv = gcv.fit(X_train,y_train)
        print(gcv.best_params_)
        ci.append(concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], gcv.predict(X_test))[0])
        print(ci)
    df1[key] = ci
df1.to_csv("SVM_bis")

In [42]:
gcv.p

{'alpha': array([3.16227766e-06, 1.00000000e-05])}

In [None]:
df=pd.DataFrame(columns=dict_features_type.keys())
for key,item in dict_features_type.items():
    x = df_final.iloc[:,item]
    y = np.array(list(zip(df_final.os_status, df_final.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
    gcv = gcv.fit(x,y)
    print (gcv.best_params_)
    ci=[]
    for i in range(2):
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
        model = FastSurvivalSVM(max_iter=1000, tol=1e-6, random_state=17, alpha=gcv.best_params_['alpha'], optimizer=gcv.best_params_['optimizer']).fit(X_train, y_train)
        ci.append(concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], model.predict(X_test))[0])
        print(ci)
    df[key] = ci