In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import ShuffleSplit, GridSearchCV,train_test_split

from sksurv.datasets import load_veterans_lung_cancer
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM,FastKernelSurvivalSVM
from sksurv.kernels import clinical_kernel

##### In this notebook, we create the different features that we want to put in our evaluation model with a dictionary. After that,
##### we use a gridsearch on SVM survival to find a good set of parameters and we train our model 25 times with the best parameters found
##### using the training and validation set. Finally, we evaluate 25 times our model on a separate testing set.

In [2]:
def score_survival_model(model, X, y):
    prediction = model.predict(X)
    result = concordance_index_censored(y['Status'], y['Survival_in_days'], prediction)
    return result[0]

In [None]:
all_features = list(range(180)) 
clin_demo_comp = list(range(154,180)) 
clin_demo_cyto_gen_comp = list(range(1,180)) 
comp = list(range(163,180)) 
cyto_comp = list(range(85,154))+list(range(163,180)) 
cyto_gen_comp = list(range(1,154))+list(range(163,180)) 
eln_clin_demo_comp = [0]+list(range(154,180)) 
eln_cyto_comp = [0]+list(range(85,154))+list(range(163,180)) 
eln_cyto_gen_comp = list(range(154))+list(range(163,180)) 
eln_gen_comp = list(range(85))+list(range(163,180)) 
gen_comp = list(range(1,85))+list(range(163,180))
clin_comp = list(range(154,161))+list(range(163,180)) 
clin_cyto_comp = list(range(85,161))+list(range(163,180)) 
clin_gen_comp = list(range(1,85))+list(range(154,161))+list(range(163,180)) 
eln_clin_comp = [0]+list(range(154,161))+list(range(163,180))


#Without age
all_features_without_age = list(range(162))+list(range(163,180))
clin_demo_comp_without_age = list(range(154,162))+list(range(163,180))
clin_demo_cyto_gen_comp_without_age = list(range(1,162))+list(range(163,180))
eln_clin_demo_comp_without_age = [0]+list(range(154,162))+list(range(163,180))
            
            
eln_clin_gen = list(range(85))+list(range(154,161))  
eln_demo_gen = list(range(85))+[161,162] 
eln_clin_demo_cyto_gen =list(range(163)) 
eln_clin_demo_cyto = [0]+list(range(85,163))

eln_clin_demo_gen = list(range(85))+list(range(154,163))
eln_clin_demo = [0] + list(range(154,163))
eln_clin = [0] + list(range(154,161))
eln_cyto_gen = list(range(154))
clin_demo_cyto_gen = list(range(1,163))
clin_demo_cyto = list(range(85,163))
clin_demo_gen = list(range(1,85))+list(range(154,163)) 
clin_demo = list(range(154,163)) 
cyto_gen = list(range(1,154))
cyto = list(range(85,154))
gen = list(range(1,85))
clin_gen = list(range(1,85)) + list(range(154,161))  
clin_cyto = list(range(85,161))  
demo_gen = list(range(1,85)) + [161,162]
demo_cyto = list(range(85,154)) + [161,162]

###Without age:

eln_demo_gen_without_age = list(range(85)) + [161]
eln_clin_demo_cyto_gen_without_age = list(range(162))
eln_clin_demo_cyto_without_age = [0] + list(range(85,162))
eln_clin_demo_gen_without_age = list(range(85)) + list(range(154,162))
eln_clin_demo_without_age = [0] + list(range(154,162))
clin_demo_cyto_gen_without_age = list(range(1,162))
clin_demo_cyto_without_age = list(range(85,162))
clin_demo_gen_without_age = list(range(1,85)) + list(range(154,162)) 
clin_demo_without_age = list(range(154,162))
demo_gen_without_age = list(range(1,85)) + [161]
demo_cyto_without_age = list(range(85,154)) + [161]
age = [162]
gen_age = list(range(1,85)) + [162]


df_final = pd.read_table("prognosis_comp_final.tsv")
estimator = FastSurvivalSVM(max_iter=1000, tol=1e-6, random_state=17)
param_grid = {'alpha': 10. ** np.array([-6,-5.5,-5,-4.5,-2.5,-1,0]),'optimizer':["avltree"]}
cv = ShuffleSplit(n_splits=5,random_state=17)
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=50, iid=False, refit=True,
                   cv=cv)
dict_features_type_final_comp = dict(zip(("all_features","clin_demo_comp","clin_demo_cyto_gen_comp","comp","cyto_comp","cyto_gen_comp","eln_clin_demo_comp","eln_cyto_comp","eln_cyto_gen_comp",
                                         "eln_gen_comp","gen_comp","clin_comp","clin_cyto_comp","clin_gen_comp","eln_clin_comp","all_features_without_age","clin_demo_comp_without_age",
                                          "clin_demo_cyto_gen_comp_without_age","eln_clin_demo_comp_without_age","eln_clin_gen","eln_demo_gen","eln_clin_demo_cyto_gen","eln_clin_demo_cyto",
                                         "eln_clin_demo_gen","eln_clin_demo","eln_clin","eln_cyto_gen","clin_demo_cyto_gen","clin_demo_cyto","clin_demo_gen","clin_demo","cyto_gen","cyto","gen",
                                          "clin_gen","clin_cyto","demo_gen","demo_cyto","eln_demo_gen_without_age","eln_clin_demo_cyto_gen_without_age","eln_clin_demo_cyto_without_age",
                                          "eln_clin_demo_gen_without_age","eln_clin_demo_without_age","clin_demo_cyto_gen_without_age","clin_demo_cyto_without_age","clin_demo_gen_without_age",
                                          "clin_demo_without_age","demo_gen_without_age","demo_cyto_without_age","age","gen_age"),
                                         (all_features,clin_demo_comp,clin_demo_cyto_gen_comp,comp,cyto_comp,cyto_gen_comp,eln_clin_demo_comp,eln_cyto_comp,eln_cyto_gen_comp,
                                         eln_gen_comp,gen_comp,clin_comp,clin_cyto_comp,clin_gen_comp,eln_clin_comp,all_features_without_age,clin_demo_comp_without_age,
                                          clin_demo_cyto_gen_comp_without_age,eln_clin_demo_comp_without_age,eln_clin_gen,eln_demo_gen,eln_clin_demo_cyto_gen,eln_clin_demo_cyto,
                                         eln_clin_demo_gen,eln_clin_demo,eln_clin,eln_cyto_gen,clin_demo_cyto_gen,clin_demo_cyto,clin_demo_gen,clin_demo,cyto_gen,cyto,gen,
                                          clin_gen,clin_cyto,demo_gen,demo_cyto,eln_demo_gen_without_age,eln_clin_demo_cyto_gen_without_age,eln_clin_demo_cyto_without_age,
                                          eln_clin_demo_gen_without_age,eln_clin_demo_without_age,clin_demo_cyto_gen_without_age,clin_demo_cyto_without_age,clin_demo_gen_without_age,
                                          clin_demo_without_age,demo_gen_without_age,demo_cyto_without_age,age,gen_age)))
df=pd.DataFrame(columns=dict_features_type_final_comp.keys())

for key,item in dict_features_type_final_comp.items():
    x = df_final.iloc[:,item]
    y = np.array(list(zip(df_final.os_status, df_final.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
    ci=[]
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
        gcv = gcv.fit(X_train,y_train)
        print(gcv.best_params_)
        ci.append(concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], gcv.predict(X_test))[0])
        print(ci)
    df[key] = ci
    
df.to_csv("SVM_comp.csv")

In [9]:
x = df_final.iloc[:,all_features]
y = np.array(list(zip(df_final.os_status, df_final.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
gcv = gcv.fit(X_train,y_train)

In [22]:
# BOOTSTRAPPING

x = df_final.iloc[:,all_features]
y = np.array(list(zip(df_final.os_status, df_final.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
df_svm = pd.DataFrame()
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1
    est = FastSurvivalSVM(alpha=1, fit_intercept=False, max_iter=1000, optimizer=None,
            random_state=17, rank_ratio=1.0, timeit=False, tol=1e-06,
            verbose=False).fit(X_train,y_train)
    df_svm = df_svm.append(pd.DataFrame({'feature': X_train.columns, 'coef': est.coef_}),ignore_index=True)
df_svm.to_csv("SVM_bootstrap.csv")

In [39]:
df_final.columns[166]

'final_component_c3_trans11'

In [57]:
df_svm = pd.DataFrame()
df_svm = df_svm.append(pd.DataFrame({'feature': X_train.columns, 'coef': est.coef_}),ignore_index=True)
df_svm = df_svm.append(pd.DataFrame({'feature': X_train.columns, 'coef': est.coef_}),ignore_index=True)
df1.shape

feature    final_component_not_assigned
coef                         0.00777019
Name: 359, dtype: object

In [3]:
df_final = pd.read_table("prognosis_comp_final.tsv")

estimator = FastSurvivalSVM(max_iter=1000, tol=1e-6, random_state=17)
param_grid = {'alpha': 10. ** np.array([-6,-5.5,-5,-4.5,-2.5,-1,0]),'optimizer':["avltree"]}
cv = ShuffleSplit(n_splits=5,random_state=17)
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=50, iid=False, refit=True,
                   cv=cv)
dict_features_type_final_comp = dict(zip(("all_features"),
                                         (all_features)))
df=pd.DataFrame(columns=dict_features_type_final_comp.keys())

for key,item in dict_features_type_final_comp.items():
    x = df_final.iloc[:,item]
    y = np.array(list(zip(df_final.os_status, df_final.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
    ci=[]
    for i in range(10):
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
        gcv = gcv.fit(X_train,y_train)
        print(gcv.best_params_)
        ci.append(concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], gcv.predict(X_test))[0])
        print(ci)
    df[key] = ci
    


ValueError: Expected 2D array, got 1D array instead:
array=[1. 0. 0. ... 0. 0. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.