In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import ShuffleSplit, GridSearchCV,train_test_split
from sklearn.model_selection import cross_validate

from sksurv.datasets import load_veterans_lung_cancer
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM,FastKernelSurvivalSVM
from sksurv.kernels import clinical_kernel
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

##### In this notebook we build a dictionary that maps each feature-set name to the columns we want to feed to the evaluation model.
##### For each feature set we then run a grid search on a survival SVM to find a good set of parameters and refit the model with the best
##### parameters found on the training/validation data. This is repeated 25 times; each time the refitted model is evaluated on a separate test set.

In [None]:
def score_survival_model(model, X, y):
    """Return the concordance index of ``model``'s predictions on ``(X, y)``.

    The ``(estimator, X, y)`` signature lets this function be passed as the
    ``scoring`` callable of ``GridSearchCV``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    X : feature matrix handed unchanged to ``model.predict``.
    y : mapping / structured array with the fields ``'Status'`` (event
        indicator) and ``'Survival_in_days'`` (observed time).

    Returns
    -------
    The first element of the result of ``concordance_index_censored``,
    i.e. the concordance index itself.
    """
    risk_scores = model.predict(X)
    cindex, *_ = concordance_index_censored(
        y['Status'], y['Survival_in_days'], risk_scores
    )
    return cindex

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import ShuffleSplit, GridSearchCV,train_test_split
from sklearn.model_selection import cross_validate

from sksurv.datasets import load_veterans_lung_cancer
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM,FastKernelSurvivalSVM
from sksurv.kernels import clinical_kernel
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# NOTE: same definition as in the earlier cell of this notebook; it is repeated
# here so that this cell can be run on its own.
def score_survival_model(model, X, y):
    """Concordance index of ``model.predict(X)`` against the outcome ``y``.

    ``y`` must expose the fields ``'Status'`` and ``'Survival_in_days'``.
    Only the first element of the ``concordance_index_censored`` result
    (the index itself) is returned, so the function can be used directly
    as a ``scoring`` callable.
    """
    predicted = model.predict(X)
    status, time = y['Status'], y['Survival_in_days']
    outcome = concordance_index_censored(status, time, predicted)
    return outcome[0]
    
# Number of random train/test splits evaluated per feature set, and the share
# of the data held out for testing in each split.
N_REPEATS = 25
TEST_SIZE = 0.2


def _frequent_columns(frame, positions, min_fraction=0.02):
    """Return the positions of the columns that are positive often enough.

    Among the columns of ``frame`` located at ``positions``, keep those whose
    value is strictly greater than 0 in more than ``min_fraction`` of all rows
    of ``frame`` and return their integer positions in ``frame``.
    """
    subset = frame.iloc[:, positions]
    positive_counts = (subset > 0).sum()
    kept = positive_counts[positive_counts > frame.shape[0] * min_fraction]
    return [frame.columns.get_loc(name) for name in kept.index]


# ---------------------------------------------------------------------------
# Load the data
# ---------------------------------------------------------------------------
df_final = pd.read_table("full_data_validation.tsv",sep=" ")

# ---------------------------------------------------------------------------
# Elementary feature groups, expressed as integer column positions of df_final
# NOTE(review): the group names presumably stand for ELN risk, components,
# gene mutations, cytogenetics, clinical and demographic variables -- confirm
# against the data dictionary.
# ---------------------------------------------------------------------------
eln = [113,114,115]
comp = list(range(89,113))
#comp_overlap = list(range(167,197))
age = [83]

# Keep only the gen / cyto columns that are > 0 in more than 2% of the rows.
all_gen = list(range(0,57))
gen = _frequent_columns(df_final, all_gen)

all_cyto = list(range(57,80))
cyto = _frequent_columns(df_final, all_cyto)

clin = list(range(84,88))
demo = [82,83]            # includes the age column (83)
demo_without_age = [82]

# ---------------------------------------------------------------------------
# Combined feature sets (list concatenation keeps the column order)
# ---------------------------------------------------------------------------
eln_comp = eln + comp
eln_gen = eln + gen
eln_cyto = eln + cyto
eln_clin = eln + clin
eln_demo = eln + demo

# USEFUL FOR ELN COMPARISON
# with comp
eln_comp_gen = eln_comp + gen
eln_comp_cyto = eln_comp + cyto
eln_comp_clin = eln_comp + clin
eln_comp_demo = eln_comp + demo

eln_comp_gen_cyto = eln_comp_gen + cyto
eln_comp_gen_clin = eln_comp_gen + clin
eln_comp_gen_demo = eln_comp_gen + demo

eln_comp_cyto_clin = eln_comp_cyto + clin
eln_comp_cyto_demo = eln_comp_cyto + demo

eln_comp_clin_demo = eln_comp_clin + demo

eln_comp_gen_cyto_clin_demo = eln_comp_gen_cyto + clin + demo
eln_comp_gen_cyto_clin_demo_without_age = eln_comp_gen_cyto + clin + demo_without_age

# without comp
eln_gen_cyto = eln_gen + cyto
eln_gen_clin = eln_gen + clin
eln_gen_demo = eln_gen + demo

eln_cyto_clin = eln_cyto + clin
eln_cyto_demo = eln_cyto + demo

eln_clin_demo = eln_clin + demo
eln_clin_demo_without_age = eln_clin + demo_without_age

eln_gen_cyto_clin_demo = eln_gen_cyto + clin + demo

# USEFUL FOR COMP
comp_gen = comp + gen
comp_cyto = comp + cyto
comp_clin = comp + clin
comp_demo = comp + demo
comp_gen_cyto = comp_gen + cyto
comp_clin_demo = comp_clin + demo
comp_gen_cyto_clin_demo = comp_gen_cyto + clin + demo

#USEFUL FOR GEN
gen_cyto = gen + cyto
gen_clin = gen + clin
gen_demo = gen + demo
gen_clin_demo = gen_clin + demo
gen_cyto_clin_demo = gen_cyto + clin + demo

#USEFUL FOR CYTO
cyto_clin = cyto + clin
cyto_demo = cyto + demo
gen_demo_without_age = gen + demo_without_age
cyto_clin_demo = cyto_clin + demo
cyto_gen_demo = gen_cyto + demo

clin_demo = clin + demo

# ---------------------------------------------------------------------------
# Feature sets that are actually evaluated (name -> column positions)
# ---------------------------------------------------------------------------
dict_features_type_final_comp = {
    "gen_cyto_clin_demo": gen_cyto_clin_demo,
    "clin": clin,
    "eln_clin_demo": eln_clin_demo,
    "comp_clin_demo": comp_clin_demo,
    "eln_comp_gen_cyto_clin_demo": eln_comp_gen_cyto_clin_demo,
}

# ---------------------------------------------------------------------------
# Model and hyper-parameter search
# ---------------------------------------------------------------------------
estimator = FastSurvivalSVM(max_iter=1000, tol=1e-6, random_state=17)
param_grid = {'alpha': 10. ** np.array([-6,-5,-4,-3,-2,-1,0]),'optimizer':["avltree"]}
cv = ShuffleSplit(n_splits=5,random_state=17)
# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises a TypeError. The current behaviour
# (plain mean of the per-split scores) is what `iid=False` used to select.
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=-1, refit=True,
                   cv=cv)

# Structured outcome array (boolean 'Status', float 'Survival_in_days').
# It does not depend on the feature set, so it is built once.
y = np.array(list(zip(df_final.OS_Status, df_final.OS)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])

# ---------------------------------------------------------------------------
# Repeated evaluation: for every feature set, N_REPEATS random train/test
# splits; grid search (with refit) on the training part, concordance index
# on the held-out part.
# ---------------------------------------------------------------------------
results = {}
for key,item in dict_features_type_final_comp.items():
    x = df_final.iloc[:,item]
    ci = []
    for i in range(N_REPEATS):
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=TEST_SIZE, random_state=i)
        gcv.fit(X_train,y_train)
        # gcv.predict uses the estimator refitted with the best parameters.
        ci.append(score_survival_model(gcv, X_test, y_test))
        print(key, i, gcv.best_params_, ci[-1])
    results[key] = ci

# One column per feature set, one row per repeat.
df = pd.DataFrame(results)
df.to_csv("SVMs.csv")

In [None]:
# NOTE: this cell repeats the computation of the preceding cell (minus the
# imports and the scorer definition). Running both recomputes the same splits
# and rewrites "SVMs.csv"; one of the two is enough.
df_final = pd.read_table("full_data_validation.tsv",sep=" ")

# Elementary feature groups, expressed as integer column positions of df_final.
eln = [113,114,115]
comp = list(range(89,113))
#comp_overlap = list(range(167,197))
age = [83]

# Keep only the gen columns that are > 0 in more than 2% of the rows.
all_gen = list(range(0,57))
tmp = (df_final.iloc[:,all_gen] > 0).sum()
gen = [df_final.columns.get_loc(c) for c in tmp[tmp>df_final.shape[0]*0.02].keys()]

# Same prevalence filter for the cyto columns.
all_cyto = list(range(57,80))
tmp = (df_final.iloc[:,all_cyto] > 0).sum()
cyto = [df_final.columns.get_loc(c) for c in tmp[tmp>df_final.shape[0]*0.02].keys()]

clin = list(range(84,88))
demo = [82,83]            # includes the age column (83)
demo_without_age = [82]

# Combined feature sets (list concatenation keeps the column order).
eln_comp = eln + comp
eln_gen = eln + gen
eln_cyto = eln + cyto
eln_clin = eln + clin
eln_demo = eln + demo

# USEFUL FOR ELN COMPARISON
# with comp
eln_comp_gen = eln_comp + gen
eln_comp_cyto = eln_comp + cyto
eln_comp_clin = eln_comp + clin
eln_comp_demo = eln_comp + demo

eln_comp_gen_cyto = eln_comp_gen + cyto
eln_comp_gen_clin = eln_comp_gen + clin
eln_comp_gen_demo = eln_comp_gen + demo

eln_comp_cyto_clin = eln_comp_cyto + clin
eln_comp_cyto_demo = eln_comp_cyto + demo

eln_comp_clin_demo = eln_comp_clin + demo

eln_comp_gen_cyto_clin_demo = eln_comp_gen_cyto + clin + demo
eln_comp_gen_cyto_clin_demo_without_age = eln_comp_gen_cyto + clin + demo_without_age

# without comp
eln_gen_cyto = eln_gen + cyto
eln_gen_clin = eln_gen + clin
eln_gen_demo = eln_gen + demo

eln_cyto_clin = eln_cyto + clin
eln_cyto_demo = eln_cyto + demo

eln_clin_demo = eln_clin + demo
eln_clin_demo_without_age = eln_clin + demo_without_age

eln_gen_cyto_clin_demo = eln_gen_cyto + clin + demo

# USEFUL FOR COMP
comp_gen = comp + gen
comp_cyto = comp + cyto
comp_clin = comp + clin
comp_demo = comp + demo
comp_gen_cyto = comp_gen + cyto
comp_clin_demo = comp_clin + demo
comp_gen_cyto_clin_demo = comp_gen_cyto + clin + demo

#USEFUL FOR GEN
gen_cyto = gen + cyto
gen_clin = gen + clin
gen_demo = gen + demo
gen_clin_demo = gen_clin + demo
gen_cyto_clin_demo = gen_cyto + clin + demo

#USEFUL FOR CYTO
cyto_clin = cyto + clin
cyto_demo = cyto + demo
gen_demo_without_age = gen + demo_without_age
cyto_clin_demo = cyto_clin + demo
cyto_gen_demo = gen_cyto + demo

clin_demo = clin + demo

# Feature sets that are actually evaluated (name -> column positions).
dict_features_type_final_comp = {
    "gen_cyto_clin_demo": gen_cyto_clin_demo,
    "clin": clin,
    "eln_clin_demo": eln_clin_demo,
    "comp_clin_demo": comp_clin_demo,
    "eln_comp_gen_cyto_clin_demo": eln_comp_gen_cyto_clin_demo,
}

estimator = FastSurvivalSVM(max_iter=1000, tol=1e-6, random_state=17)
param_grid = {'alpha': 10. ** np.array([-6,-5,-4,-3,-2,-1,0]),'optimizer':["avltree"]}
cv = ShuffleSplit(n_splits=5,random_state=17)
# `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22 and
# removed in 0.24, where passing it raises a TypeError. The current behaviour
# (plain mean of the per-split scores) is what `iid=False` used to select.
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=-1, refit=True,
                   cv=cv)

# Structured outcome array (boolean 'Status', float 'Survival_in_days').
# It is the same for every feature set, so it is built once, outside the loop.
y = np.array(list(zip(df_final.OS_Status, df_final.OS)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])

# For every feature set: 25 random 80/20 train/test splits; grid search (with
# refit) on the training part, concordance index on the held-out part.
results = {}
for key,item in dict_features_type_final_comp.items():
    x = df_final.iloc[:,item]
    ci = []
    for i in range(25):
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
        gcv.fit(X_train,y_train)
        # gcv.predict uses the estimator refitted with the best parameters.
        ci.append(concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], gcv.predict(X_test))[0])
        print(key, i, gcv.best_params_, ci[-1])
    results[key] = ci

# One column per feature set, one row per repeat.
df = pd.DataFrame(results)
df.to_csv("SVMs.csv")

In [112]:
# Merge the freshly computed scores ("SVMs.csv") into the previously saved
# table ("SVM_val.csv") and store the result as "SVM_validation.csv".
# In both files the first column is discarded after loading.
a = pd.read_csv("SVM_val.csv",sep=',')
a = a.drop(columns=a.columns[0])

b = pd.read_csv("SVMs.csv",sep=',')
b = b.drop(columns=b.columns[0])

# Columns of `b` overwrite same-named columns of `a`; new ones are appended.
a[b.columns] = b
a.to_csv("SVM_validation.csv")

In [None]:
# Write `a` and `b` side by side to "SVM_val.csv".
# NOTE(review): this overwrites the file that the neighbouring cells read into
# `a`; if `a` already holds the columns of `b`, those labels end up duplicated
# in the written file -- confirm this cell is still meant to be run.
combined = pd.concat([a, b], axis="columns")
combined.to_csv("SVM_val.csv")

In [111]:
# Reload the saved table, discard its first column and refresh it with the
# columns of `b` (which must already exist from the cell that reads
# "SVMs.csv"), then display the result.
a = pd.read_csv("SVM_val.csv",sep=',')
a = a.drop(columns=a.columns[0])
a[b.columns] = b
a

Unnamed: 0,demo,clin,gen,cyto,comp,eln,gen_cyto,eln_gen_cyto,comp_gen_cyto,eln_comp,eln_clin_demo,comp_clin_demo,eln_comp_gen_cyto_clin_demo,cyto_gen_demo,gen_cyto_clin_demo
0,0.604039,0.54295,0.634754,0.589022,0.612136,0.613697,0.646289,0.64466,0.640657,0.626838,0.668159,0.667752,0.675714,0.658163,0.680689
1,0.610236,0.514387,0.672617,0.606935,0.677869,0.65988,0.688863,0.686655,0.692512,0.669339,0.686562,0.684842,0.675406,0.683029,0.685167
2,0.621296,0.498466,0.632803,0.596682,0.649833,0.610731,0.661253,0.65985,0.659653,0.647028,0.661582,0.674645,0.681922,0.685473,0.677933
3,0.637416,0.467029,0.653088,0.602074,0.64967,0.623907,0.660342,0.660924,0.665016,0.641369,0.703497,0.704288,0.714007,0.693871,0.705404
4,0.615978,0.545116,0.640543,0.583927,0.655219,0.625958,0.667105,0.665064,0.669963,0.647122,0.685478,0.710021,0.706301,0.677744,0.697183
5,0.602364,0.553943,0.606928,0.565898,0.60183,0.59829,0.632686,0.628033,0.619462,0.597756,0.653858,0.653591,0.659557,0.648025,0.666682
6,0.594569,0.53375,0.616305,0.579065,0.629647,0.61335,0.633459,0.644531,0.634487,0.622152,0.647893,0.64725,0.656844,0.649306,0.652647
7,0.601238,0.566103,0.63104,0.590002,0.656939,0.599738,0.649464,0.641085,0.665032,0.63861,0.670031,0.706641,0.694882,0.658081,0.690359
8,0.58544,0.534821,0.614215,0.556687,0.611737,0.606078,0.626671,0.615579,0.634308,0.618852,0.64274,0.658196,0.664788,0.640581,0.666879
9,0.615496,0.55839,0.667373,0.579438,0.672263,0.648375,0.692883,0.681503,0.693852,0.674291,0.706224,0.715419,0.717763,0.68628,0.724028


In [109]:
# Display `b`, the scores read from "SVMs.csv" (first column dropped).
b

Unnamed: 0,gen_cyto_clin_demo,clin,eln_clin_demo,comp_clin_demo,eln_comp_gen_cyto_clin_demo
0,0.680689,0.54295,0.668159,0.667752,0.675714
1,0.685167,0.514387,0.686562,0.684842,0.675406
2,0.677933,0.498466,0.661582,0.674645,0.681922
3,0.705404,0.467029,0.703497,0.704288,0.714007
4,0.697183,0.545116,0.685478,0.710021,0.706301
5,0.666682,0.553943,0.653858,0.653591,0.659557
6,0.652647,0.53375,0.647893,0.64725,0.656844
7,0.690359,0.566103,0.670031,0.706641,0.694882
8,0.666879,0.534821,0.64274,0.658196,0.664788
9,0.724028,0.55839,0.706224,0.715419,0.717763


In [105]:
# Show the names of the df_final columns selected by the `clin` positions.
df_final.iloc[:,clin].columns

Index(['HB', 'BM_Blasts', 'PLT', 'WBC'], dtype='object')