# Summary: This Notebook shows the experimental results using MSK data, including BIKG-based, panel based, TMB based OS predictive performance, etc.


In [None]:
import os
import random
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sys

In [None]:
from KMPlot import subplots
from KMPlot import KMPlot

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sksurv.util import Surv
from sksurv.datasets import load_gbsg2
from sksurv.preprocessing import OneHotEncoder
from pysurvival.models.survival_forest import RandomSurvivalForestModel
from lifelines.utils import concordance_index as lfcindex
from sklearn.tree import DecisionTreeRegressor


In [None]:
#set random seed
#randomSeed=1
#randomSeed=2
#randomSeed=3
#randomSeed=4
#randomSeed=5
#randomSeed=6
#randomSeed=7
#randomSeed=8
#randomSeed=9
randomSeed=10

In [None]:
def display_summary(df, name: str = None):
    """Print the shape of a DataFrame and show a random sample of its rows.

    Args:
        df: DataFrame to summarise.
        name: Optional label printed as a heading before the summary.

    The sample size is capped at the number of rows so that frames with fewer
    than five rows do not make ``DataFrame.sample`` raise ``ValueError``.
    """
    if name:
        print(f"Summary of data for: {name}")
    n_rows, n_cols = df.shape
    print(f"Number of rows: {n_rows}")
    print(f"Number of columns: {n_cols}")
    print("\nSample of data:")
    # `display` is the rich-output helper provided by the IPython/Jupyter session.
    display(df.sample(min(5, n_rows)))

### Loading preprocessed dataset including patient embedding, genomic features, survival information


In [None]:
whichSetID=1
work_dir='../Data/outputs/RobustTestUsingMSK/'
y_dataframe=np.load(work_dir+str(whichSetID)+'/y_dataframe.npy') 
y_holdout=np.load(work_dir+str(whichSetID)+'/y_holdout.npy') 
tmbValue=np.load(work_dir+str(whichSetID)+'/tmbValue.npy') 

In [None]:
patient_embedding_dataframe=pd.read_csv(work_dir+str(whichSetID)+'/patient_embedding_dataframe.csv',sep=',',index_col='source_label') 
patient_embedding_holdout=pd.read_csv(work_dir+str(whichSetID)+'/patient_embedding_holdout.csv',sep=',',index_col='source_label') 
genomic_features=pd.read_csv(work_dir+str(whichSetID)+'/genomic_features.csv',sep=',',index_col='SAMPLE_ID')

### Patient cohort statistics. For this MSK dataset, there are 1855 patients. The gene panel contains 481 genes

In [None]:
display_summary(genomic_features, "patient genomic features")

# Perform experiment and get the mean performance

### The following block performs 10 repeated stratified 90/10 train/test splits (split seeds 0-9) and reports the average performance

In [None]:

#(patient_embedding_dataframe.join(genomic_features,how='left')).iloc[:,16:]
#np.random.rand(1602,16)
# Repeat a stratified 90/10 split with ten different split seeds and record the
# concordance index of a random survival forest on each held-out 10%.
c_index_list = []
# Stratify on the first field of every outcome record (the field later passed to the model as E).
event_strata = [x[0] for x in y_dataframe]
for experimentID in range(0, 10):
    X_train, X_test, y_train, y_test = train_test_split(
        patient_embedding_dataframe, y_dataframe, test_size=0.1,
        stratify=event_strata, random_state=experimentID)
    downstream_model = RandomSurvivalForestModel(num_trees=100)
    y_train_censorship = [x[0] for x in y_train]
    y_train_time = [x[1] for x in y_train]
    downstream_model.fit(X=X_train, T=y_train_time, E=y_train_censorship, seed=randomSeed)
    y_test_censorship = [x[0] for x in y_test]
    y_test_time = [x[1] for x in y_test]
    y_pred = downstream_model.predict_risk(X_test)
    # lifelines' concordance_index expects scores that grow with survival time, so
    # the risk score is negated. The previous "1 - c if c < 0.5" flip reported
    # max(c, 1 - c), which made an anti-predictive model look good and let it be
    # selected as the best split.
    c_index = lfcindex(y_test_time, -np.asarray(y_pred), y_test_censorship)
    print(c_index)
    c_index_list.append(c_index)


print("Average performance:")
print(np.mean(c_index_list))
print(np.std(c_index_list))

In [None]:
def find_best(array):
    """Return the position of the largest value in ``array`` (first one on ties)."""
    values = np.asarray(array)
    return np.argmax(values)

### here is to identify the model with best validation performance

In [None]:
bestInd=find_best(c_index_list)
bestInd

# BIKG based OS predictive performance

### apply the model with the best validation performance and evaluate its performance (concordance index)

In [None]:

# Refit on the split whose seed gave the highest c-index in the repeated-split experiment.
random_state = int(bestInd)
X_train, X_test, y_train, y_test = train_test_split(
    patient_embedding_dataframe, y_dataframe, test_size=0.1,
    stratify=[x[0] for x in y_dataframe], random_state=random_state)
downstream_model = RandomSurvivalForestModel(num_trees=100)
y_train_censorship = [x[0] for x in y_train]
y_train_time = [x[1] for x in y_train]
downstream_model.fit(X=X_train, T=y_train_time, E=y_train_censorship, seed=randomSeed)
y_test_censorship = [x[0] for x in y_test]
y_test_time = [x[1] for x in y_test]
y_pred = downstream_model.predict_risk(X_test)
# Negate the risk: lifelines' concordance_index expects scores that increase with
# survival time. Reporting max(c, 1 - c) instead would hide anti-predictive models.
c_index = lfcindex(y_test_time, -np.asarray(y_pred), y_test_censorship)
print(c_index)

### the performance on holdout dataset

In [None]:
# Score the holdout cohort with the model fitted in the previous cell.
y_pred_holdout = downstream_model.predict_risk(patient_embedding_holdout)
y_holdout_censorship = [x[0] for x in y_holdout]
y_holdout_time = [x[1] for x in y_holdout]
# Risk is negated because lifelines' concordance_index expects scores that increase
# with survival time; the raw value is reported instead of max(c, 1 - c).
c_index = lfcindex(y_holdout_time, -np.asarray(y_pred_holdout), y_holdout_censorship)
print(c_index)

### the following block is to get the 75th percentile cutoff based on training dataset

In [None]:
# get the cutoff using training data
y_pred_dataframe=downstream_model.predict_risk(patient_embedding_dataframe)
cutoff_75_percentile=np.quantile(y_pred_dataframe, 0.75)
cutoff_75_percentile

### the following block draw Kaplan-Meier plots and the patients are stratified into high- versus low-risk group based on 75th percentile cutoff 

In [None]:
df=pd.DataFrame([y_pred_holdout,y_holdout_time,y_holdout_censorship]).T
df.columns=['predictRisk','OS','censorLabel']
df['group'] = 'Unknown'
df.loc[df.predictRisk >= cutoff_75_percentile,'group']= "High"
df.loc[df.predictRisk < cutoff_75_percentile,'group']= "Low"


axs = subplots(cols=1, rows=1, w=6, h=4)
KMPlot(df, time='OS', event='censorLabel', label=[ 'group'], score='predictRisk').plot(
    ['High', 'Low'], ax=axs[0],
    comparisons=[['Low', 'High', 'Low vs High']],
    saturation=0.9,
    linewidth=1.5,
    palette='Set1',
    template_color = 'black',xy_font_size=18,
    x_legend = 0.5, y_legend=0.95,legend_font_size=13,
    label_height_adj=0.06,
    x_hr_legand=0.6,y_hr_legend=.1,hr_font_size=13, hr_color='black',
);
sns.despine(offset=2)

In [None]:
#df.to_csv('BIKGPredictedRisk_MSK.csv',index=False)

### the following block is used to compare the average mutation frequency between the high- and low-risk groups

In [None]:
def compareGeneDiffBetweenHighVSLow(df,genomic_features,patient_embedding_holdout,tmbValue):
    """Return per-group means of ten fixed gene columns and TMB.

    Rows of ``genomic_features`` are selected by the index of
    ``patient_embedding_holdout`` and renumbered from 0, so ``tmbValue`` and
    ``df['group']`` are matched to them by position / default integer index.
    """
    gene_columns = [
        'molecular_STK11', 'molecular_KRAS', 'molecular_KEAP1', 'molecular_TP53',
        'molecular_SMARCA4', 'molecular_ATM', 'molecular_PTPRD', 'molecular_EPHA3',
        'molecular_RBM10', 'molecular_FAT1',
    ]
    holdout_ids = patient_embedding_holdout.index
    per_patient = genomic_features.loc[holdout_ids, gene_columns].reset_index(drop=True)
    per_patient = per_patient.assign(tmb=tmbValue, group=df['group'])
    return per_patient.groupby("group").mean()

In [None]:
compareGeneDiffBetweenHighVSLow(df,genomic_features,patient_embedding_holdout,tmbValue)


In [None]:
#patient_embedding_holdout['group']=list(df['group'])
#patient_embedding_holdout

In [None]:
#patient_embedding_holdout.groupby("group").mean()


In [None]:
#import umap
#import matplotlib.pyplot as plt
#standard_embedding = umap.UMAP(random_state=10).fit_transform(patient_embedding_holdout.iloc[:,0:16])
#color_dict = { 'High':'red', 'Low':'blue',}

#plt.scatter(standard_embedding[:, 0], standard_embedding[:, 1], c=[color_dict[i] for i in df['group']])


In [None]:
#df['OSThreeMonth']='LessOrEqual'

#df.loc[df['OS'] > 3, 'OSThreeMonth'] = 'Greater'

#df

In [None]:
"""
def sens_spec_rates(predictedRisk, OS, thresholds):
    sensitivity = []
    specificity = []
    tp_arr=[]
    fp_arr=[]
    tn_arr=[]
    fn_arr=[]
    for threshold in thresholds:
        cutoff_OS = np.where(OS >= threshold, 'greaterThan', 'lessThan')
        fn = np.sum((cutoff_OS == 'greaterThan') & (predictedRisk == 'High'))
        tn = np.sum((cutoff_OS == 'greaterThan') & (predictedRisk == 'Low'))
        fp = np.sum((cutoff_OS == 'lessThan') & (predictedRisk == 'Low'))
        tp = np.sum((cutoff_OS == 'lessThan') & (predictedRisk == 'High'))
        sensitivity.append(tp / (tp + fn))
        specificity.append(tn / (tn + fp))
        tp_arr.append(tp)
        fp_arr.append(fp)
        tn_arr.append(tn)
        fn_arr.append(fn)
    return [sensitivity, specificity,tp_arr,fp_arr,tn_arr,fn_arr]

"""

In [None]:
#thresholds=[3,6,9,12,15,18,21]
#[sensitivity, specificity,tp_arr,fp_arr,tn_arr,fn_arr]=sens_spec_rates(df.group, df.OS, thresholds)
#pd.DataFrame(zip(thresholds,sensitivity, specificity,tp_arr,fp_arr,tn_arr,fn_arr), columns = ['thresholds(month)', 'sensitivity', 'specificity','tp','fp','tn','fn'])

In [None]:
#df.groupby(['group', 'OSThreeMonth']).size()

# BIKG based (with additional gene panel) OS predictive performance

### similarly as previous experiment, this block is to evaluate the performance of (BIKG prior knowledge feature + genomic feature)

In [None]:
# Same split seed as the BIKG-only experiment, but the features are the patient
# embedding left-joined with the genomic feature columns.
random_state = int(bestInd)
X_train, X_test, y_train, y_test = train_test_split(
    patient_embedding_dataframe.join(genomic_features, how='left'), y_dataframe, test_size=0.1,
    stratify=[x[0] for x in y_dataframe], random_state=random_state)
downstream_model = RandomSurvivalForestModel(num_trees=100)
y_train_censorship = [x[0] for x in y_train]
y_train_time = [x[1] for x in y_train]
downstream_model.fit(X=X_train, T=y_train_time, E=y_train_censorship, seed=randomSeed)
y_test_censorship = [x[0] for x in y_test]
y_test_time = [x[1] for x in y_test]
y_pred = downstream_model.predict_risk(X_test)
# Negate the risk: lifelines' concordance_index expects scores that increase with
# survival time. Reporting max(c, 1 - c) instead would hide anti-predictive models.
c_index = lfcindex(y_test_time, -np.asarray(y_pred), y_test_censorship)
print(c_index)

In [None]:
# Score the holdout cohort using the embedding left-joined with the genomic features.
y_pred_holdout = downstream_model.predict_risk(patient_embedding_holdout.join(genomic_features, how='left'))
y_holdout_censorship = [x[0] for x in y_holdout]
y_holdout_time = [x[1] for x in y_holdout]
# Risk is negated because lifelines' concordance_index expects scores that increase
# with survival time; the raw value is reported instead of max(c, 1 - c).
c_index = lfcindex(y_holdout_time, -np.asarray(y_pred_holdout), y_holdout_censorship)
print(c_index)


In [None]:
# get the cutoff using training data
y_pred_dataframe=downstream_model.predict_risk(patient_embedding_dataframe.join(genomic_features,how='left'))
cutoff_75_percentile=np.quantile(y_pred_dataframe, 0.75)
cutoff_75_percentile

In [None]:
df=pd.DataFrame([y_pred_holdout,y_holdout_time,y_holdout_censorship]).T
df.columns=['predictRisk','OS','censorLabel']
df['group'] = 'Unknown'
df.loc[df.predictRisk >= cutoff_75_percentile,'group']= "High"
df.loc[df.predictRisk < cutoff_75_percentile,'group']= "Low"



axs = subplots(cols=1, rows=1, w=6, h=4)
KMPlot(df, time='OS', event='censorLabel', label=[ 'group'], score='predictRisk').plot(
    ['High', 'Low'], ax=axs[0],
    comparisons=[['Low', 'High', 'Low vs High']],
    saturation=0.9,
    linewidth=1.5,
    palette='Set1',
    template_color = 'black',xy_font_size=18,
    hr_color='black',
    x_legend = 0.5, y_legend=0.95,legend_font_size=12,
    label_height_adj=0.06,
    x_hr_legand=0.0,y_hr_legend=.1,hr_font_size=12,
);



sns.despine(offset=2)

In [None]:
compareGeneDiffBetweenHighVSLow(df,genomic_features,patient_embedding_holdout,tmbValue)


# model using TMB

In [None]:
### for checking the predicted risk using BIKG embedding features
#df.to_excel("BIKGPredictedRisk.xlsx")

### This block of code evaluates patient stratification using traditional TMB as the biomarker. If TMB >= the 75th percentile cutoff (computed on the training cohort), the patient is labelled High; otherwise Low.

In [None]:
# get 75% cutoff from training dataset
clinical_features = pd.read_csv('../Data/inputs/inputDatasetMSKMET2021/clinical_features_Lung_Adenocarcinoma.csv', sep=',')
#clinical_subgroup = clinical_features[((clinical_features['SAMPLE_TYPE'].isin(['Metastasis']))&(clinical_features['Metastatic patient']==True))]
clinical_subgroup = clinical_features[((clinical_features['SAMPLE_TYPE'].isin(['Primary']))&(clinical_features['Metastatic patient']==True))]
TMB_training=np.array(clinical_subgroup[clinical_subgroup['SAMPLE_ID'].isin(patient_embedding_dataframe.index)]['TMB_NONSYNONYMOUS'])
TMB_cutoff_75=np.quantile(TMB_training, 0.75)

In [None]:
TMB_cutoff_75

In [None]:

df=pd.DataFrame([tmbValue,y_holdout_time,y_holdout_censorship]).T
df.columns=['tmbValue','OS','censorLabel']
df['group'] = 'Unknown'
df.loc[df.tmbValue >= TMB_cutoff_75,'group']= "High"
df.loc[df.tmbValue < TMB_cutoff_75,'group']= "Low"


axs = subplots(cols=1, rows=1, w=6, h=4)
KMPlot(df, time='OS', event='censorLabel', label=[ 'group'], score='tmbValue').plot(
    ['High', 'Low'], ax=axs[0],
    comparisons=[['Low', 'High', 'Low vs High']],
    saturation=0.9,
    linewidth=1.5,
    palette='Set1',
    template_color = 'black',xy_font_size=18,
    hr_color='black',
    x_legend = 0.5, y_legend=0.95,legend_font_size=12,
    label_height_adj=0.06,
    x_hr_legand=0.0,y_hr_legend=.1,hr_font_size=12,
);



sns.despine(offset=2)

In [None]:
#df['OSThreeMonth']='LessOrEqual'

#df.loc[df['OS'] > 3, 'OSThreeMonth'] = 'Greater'

#df

In [None]:
#df.groupby(['group', 'OSThreeMonth']).size()

#thresholds=[3,6,9,12,15,18,21]
#[sensitivity, specificity,tp_arr,fp_arr,tn_arr,fn_arr]=sens_spec_rates(df.group, df.OS, thresholds)
#pd.DataFrame(zip(thresholds,sensitivity, specificity,tp_arr,fp_arr,tn_arr,fn_arr), columns = ['thresholds(month)', 'sensitivity', 'specificity','tp','fp','tn','fn'])

# MSK gene panel

### This block is to evaluate the performance of genomic feature (i.e. MSK gene panel)

In [None]:

# Same split seed as the BIKG experiment, but the features are the genomic
# feature rows of the training-cohort patients.
random_state = int(bestInd)
X_train, X_test, y_train, y_test = train_test_split(
    genomic_features.loc[patient_embedding_dataframe.index], y_dataframe, test_size=0.1,
    stratify=[x[0] for x in y_dataframe], random_state=random_state)
downstream_model = RandomSurvivalForestModel(num_trees=100)
y_train_censorship = [x[0] for x in y_train]
y_train_time = [x[1] for x in y_train]
downstream_model.fit(X=X_train, T=y_train_time, E=y_train_censorship, seed=randomSeed)
y_test_censorship = [x[0] for x in y_test]
y_test_time = [x[1] for x in y_test]
y_pred = downstream_model.predict_risk(X_test)
# Negate the risk: lifelines' concordance_index expects scores that increase with
# survival time. Reporting max(c, 1 - c) instead would hide anti-predictive models.
c_index = lfcindex(y_test_time, -np.asarray(y_pred), y_test_censorship)
print(c_index)

In [None]:
# Score the holdout cohort using its genomic feature rows.
y_pred_holdout = downstream_model.predict_risk(genomic_features.loc[patient_embedding_holdout.index,])
y_holdout_censorship = [x[0] for x in y_holdout]
y_holdout_time = [x[1] for x in y_holdout]
# Risk is negated because lifelines' concordance_index expects scores that increase
# with survival time; the raw value is reported instead of max(c, 1 - c).
c_index = lfcindex(y_holdout_time, -np.asarray(y_pred_holdout), y_holdout_censorship)
print(c_index)

In [None]:
# get the cutoff using training data
y_pred_dataframe=downstream_model.predict_risk(genomic_features.loc[patient_embedding_dataframe.index])
cutoff_75_percentile=np.quantile(y_pred_dataframe, 0.75)
cutoff_75_percentile

In [None]:

df=pd.DataFrame([y_pred_holdout,y_holdout_time,y_holdout_censorship]).T
df.columns=['predictRisk','OS','censorLabel']
df['group'] = 'Unknown'
df.loc[df.predictRisk >= cutoff_75_percentile,'group']= "High"
df.loc[df.predictRisk < cutoff_75_percentile,'group']= "Low"



axs = subplots(cols=1, rows=1, w=6, h=4)
KMPlot(df, time='OS', event='censorLabel', label=[ 'group'], score='predictRisk').plot(
    ['High', 'Low'], ax=axs[0],
    comparisons=[['Low', 'High', 'Low vs High']],
    saturation=0.9,
    linewidth=1.5,
    palette='Set1',
    template_color = 'black',xy_font_size=18,
    hr_color='black',
    x_legend = 0.5, y_legend=0.95,legend_font_size=12,
    label_height_adj=0.06,
    x_hr_legand=0.0,y_hr_legend=.1,hr_font_size=12,
);



sns.despine(offset=2)

In [None]:
#df.to_csv('MSKGenePanelPredictedRisk.csv',index=False)

In [None]:
compareGeneDiffBetweenHighVSLow(df,genomic_features,patient_embedding_holdout,tmbValue)

In [None]:
#df['OSThreeMonth']='LessOrEqual'

#df.loc[df['OS'] > 3, 'OSThreeMonth'] = 'Greater'

#df

In [None]:
#df.groupby(['group', 'OSThreeMonth']).size()
#thresholds=[3,6,9,12,15,18,21]
#[sensitivity, specificity,tp_arr,fp_arr,tn_arr,fn_arr]=sens_spec_rates(df.group, df.OS, thresholds)
#pd.DataFrame(zip(thresholds,sensitivity, specificity,tp_arr,fp_arr,tn_arr,fn_arr), columns = ['thresholds(month)', 'sensitivity', 'specificity','tp','fp','tn','fn'])

# MSK gene panel using autoencoder embedding

### The following block is to evaluate some other embedding approach for comparison purpose, here we used Autoencoder embedding to compare with graph embedding

In [None]:
import keras
from keras import layers

encoding_dim = 16 

train_x = genomic_features.loc[patient_embedding_dataframe.index]
test_x = genomic_features.loc[patient_embedding_holdout.index]

inputFeature = keras.Input(shape=(train_x.shape[1],))
# "encoded" is the encoded representation of the input
encoded = layers.Dense(encoding_dim, activation='relu')(inputFeature)
# "decoded" is the lossy reconstruction of the input
decoded = layers.Dense(train_x.shape[1], activation='sigmoid')(encoded)
autoencoder = keras.Model(inputFeature, decoded)

encoder = keras.Model(inputFeature, encoded)

autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
autoencoder.fit(train_x, train_x,
                epochs=200,
                batch_size=100,
                shuffle=True,
                validation_data=(test_x, test_x),
                verbose=0)

encoded_features_train = encoder.predict(train_x)
encoded_features_test = encoder.predict(test_x)

In [None]:
# Same split seed as the BIKG experiment, but the features are the autoencoder
# encodings of the training cohort.
random_state = int(bestInd)
X_train, X_test, y_train, y_test = train_test_split(
    encoded_features_train, y_dataframe, test_size=0.1,
    stratify=[x[0] for x in y_dataframe], random_state=random_state)
downstream_model = RandomSurvivalForestModel(num_trees=100)
y_train_censorship = [x[0] for x in y_train]
y_train_time = [x[1] for x in y_train]
downstream_model.fit(X=X_train, T=y_train_time, E=y_train_censorship, seed=randomSeed)
y_test_censorship = [x[0] for x in y_test]
y_test_time = [x[1] for x in y_test]
y_pred = downstream_model.predict_risk(X_test)
# Negate the risk: lifelines' concordance_index expects scores that increase with
# survival time. Reporting max(c, 1 - c) instead would hide anti-predictive models.
c_index = lfcindex(y_test_time, -np.asarray(y_pred), y_test_censorship)
print(c_index)

In [None]:
# Score the holdout cohort using its autoencoder encodings.
y_pred_holdout = downstream_model.predict_risk(encoded_features_test)
y_holdout_censorship = [x[0] for x in y_holdout]
y_holdout_time = [x[1] for x in y_holdout]
# Risk is negated because lifelines' concordance_index expects scores that increase
# with survival time; the raw value is reported instead of max(c, 1 - c).
c_index = lfcindex(y_holdout_time, -np.asarray(y_pred_holdout), y_holdout_censorship)
print(c_index)


In [None]:
y_pred_dataframe=downstream_model.predict_risk(encoded_features_train)
cutoff_75_percentile=np.quantile(y_pred_dataframe, 0.75)
cutoff_75_percentile

In [None]:
df=pd.DataFrame([y_pred_holdout,y_holdout_time,y_holdout_censorship]).T
df.columns=['predictRisk','OS','censorLabel']
df['group'] = 'Unknown'
df.loc[df.predictRisk >= cutoff_75_percentile,'group']= "High"
df.loc[df.predictRisk < cutoff_75_percentile,'group']= "Low"



axs = subplots(cols=1, rows=1, w=6, h=4)
KMPlot(df, time='OS', event='censorLabel', label=[ 'group'], score='predictRisk').plot(
    ['High', 'Low'], ax=axs[0],
    comparisons=[['Low', 'High', 'Low vs High']],
    saturation=0.9,
    linewidth=1.5,
    palette='Set1',
    template_color = 'black',xy_font_size=18,
    hr_color='black',
    x_legend = 0.5, y_legend=0.95,legend_font_size=12,
    label_height_adj=0.06,
    x_hr_legand=0.0,y_hr_legend=.1,hr_font_size=12,
);



sns.despine(offset=2)

# PCA embedding

### Similarly as previous experiment, here we applied PCA embedding to compare with graph embedding

In [None]:
from sklearn.decomposition import PCA

encoding_dim = 16 

train_x = genomic_features.loc[patient_embedding_dataframe.index]
test_x = genomic_features.loc[patient_embedding_holdout.index]

pca = PCA(n_components=encoding_dim,random_state=123)
pca.fit(train_x)

X_train_pca = pca.transform(train_x)
X_test_pca = pca.transform(test_x)




In [None]:
# Same split seed as the BIKG experiment, but the features are the PCA
# projections of the training cohort.
random_state = int(bestInd)
X_train, X_test, y_train, y_test = train_test_split(
    X_train_pca, y_dataframe, test_size=0.1,
    stratify=[x[0] for x in y_dataframe], random_state=random_state)
downstream_model = RandomSurvivalForestModel(num_trees=100)
y_train_censorship = [x[0] for x in y_train]
y_train_time = [x[1] for x in y_train]
downstream_model.fit(X=X_train, T=y_train_time, E=y_train_censorship, seed=randomSeed)
y_test_censorship = [x[0] for x in y_test]
y_test_time = [x[1] for x in y_test]
y_pred = downstream_model.predict_risk(X_test)
# Negate the risk: lifelines' concordance_index expects scores that increase with
# survival time. Reporting max(c, 1 - c) instead would hide anti-predictive models.
c_index = lfcindex(y_test_time, -np.asarray(y_pred), y_test_censorship)
print(c_index)

In [None]:
# Score the holdout cohort using its PCA projections.
y_pred_holdout = downstream_model.predict_risk(X_test_pca)
y_holdout_censorship = [x[0] for x in y_holdout]
y_holdout_time = [x[1] for x in y_holdout]
# Risk is negated because lifelines' concordance_index expects scores that increase
# with survival time; the raw value is reported instead of max(c, 1 - c).
c_index = lfcindex(y_holdout_time, -np.asarray(y_pred_holdout), y_holdout_censorship)
print(c_index)


In [None]:
y_pred_dataframe=downstream_model.predict_risk(X_train_pca)
cutoff_75_percentile=np.quantile(y_pred_dataframe, 0.75)
cutoff_75_percentile

In [None]:
df=pd.DataFrame([y_pred_holdout,y_holdout_time,y_holdout_censorship]).T
df.columns=['predictRisk','OS','censorLabel']
df['group'] = 'Unknown'
df.loc[df.predictRisk >= cutoff_75_percentile,'group']= "High"
df.loc[df.predictRisk < cutoff_75_percentile,'group']= "Low"



axs = subplots(cols=1, rows=1, w=6, h=4)
KMPlot(df, time='OS', event='censorLabel', label=[ 'group'], score='predictRisk').plot(
    ['High', 'Low'], ax=axs[0],
    comparisons=[['Low', 'High', 'Low vs High']],
    saturation=0.9,
    linewidth=1.5,
    palette='Set1',
    template_color = 'black',xy_font_size=18,
    hr_color='black',
    x_legend = 0.5, y_legend=0.95,legend_font_size=12,
    label_height_adj=0.06,
    x_hr_legand=0.0,y_hr_legend=.1,hr_font_size=12,
);



sns.despine(offset=2)

# Identify variable importance and association with inputs

### The following code is used to identify feature importance (the higher the value, the more important the feature). The importance of a feature is the difference between the perturbed and unperturbed error rate for that feature.

In [None]:
# For ten split seeds: fit the survival forest, take the embedding feature in row 0
# of its variable-importance table, regress that embedding feature on the genomic
# features with a decision tree, and count how often each genomic feature lands in
# the tree's top ten.
importantFeature = {}
# Loop-invariant: genomic feature rows of the training-cohort patients.
genomic_features_train = genomic_features.loc[patient_embedding_dataframe.index,]
for i in range(0, 10):
    X_train, X_test, y_train, y_test = train_test_split(
        patient_embedding_dataframe, y_dataframe, test_size=0.1,
        stratify=[x[0] for x in y_dataframe], random_state=i)
    downstream_model = RandomSurvivalForestModel(num_trees=100)
    y_train_censorship = [x[0] for x in y_train]
    y_train_time = [x[1] for x in y_train]
    downstream_model.fit(X=X_train, T=y_train_time, E=y_train_censorship, seed=randomSeed)
    y_test_censorship = [x[0] for x in y_test]
    y_test_time = [x[1] for x in y_test]
    y_pred = downstream_model.predict_risk(X_test)
    # Negate the risk: lifelines' concordance_index expects scores that increase with
    # survival time. Reporting max(c, 1 - c) instead would hide anti-predictive models.
    c_index = lfcindex(y_test_time, -np.asarray(y_pred), y_test_censorship)
    print(c_index)
    # identify the most important embedding feature associated with survival prediction
    # (row label 0 of the table; presumably the table is sorted by importance - confirm)
    mostImportantFeatures = downstream_model.variable_importance_table.head(10)
    top_embedding_feature = mostImportantFeatures.loc[0, 'feature']
    # fit a decision tree regression model to associate that embedding feature with molecular features
    regressor = DecisionTreeRegressor(random_state=i)
    regressor.fit(genomic_features_train, patient_embedding_dataframe[top_embedding_feature])
    # sort the genomic features in decreasing order of their importance
    importance = regressor.feature_importances_
    indices = np.argsort(importance)[::-1]
    # select the top 10 genomic features
    rankTable = pd.DataFrame(
        list(zip(genomic_features_train.columns[indices], importance[indices])),
        columns=['FeatureName', 'Importance'])
    selected = rankTable.iloc[0:10, :]
    # tally how many of the ten models selected each genomic feature
    genomicFeatureList = list(selected['FeatureName'])
    for gene in genomicFeatureList:
        importantFeature[gene] = importantFeature.get(gene, 0) + 1

In [None]:
topFeaturesAmongTenModels=sorted(importantFeature.items(), key=lambda x: x[1], reverse=True)[:10]
topFeaturesAmongTenModels

In [None]:
def calculateMutationalFreq(df,topFeatureList):
    """Return, for each column in ``topFeatureList``, its column sum divided by the row count.

    Args:
        df: DataFrame holding one row per sample.
        topFeatureList: Column labels to summarise.

    Returns:
        A Series indexed by the labels in ``topFeatureList``.

    The reduction axis is explicit: ``np.sum(DataFrame)`` goes through pandas'
    ``axis=None`` handling, which newer pandas versions deprecate in favour of a
    whole-frame scalar reduction.
    """
    return df.loc[:, topFeatureList].sum(axis=0) / df.shape[0]

In [None]:
topFeatureList=[gene for (gene, frequency) in topFeaturesAmongTenModels]

calculateMutationalFreq(genomic_features.loc[patient_embedding_dataframe.index],topFeatureList)

In [None]:
topFeatureList=[gene for (gene, frequency) in topFeaturesAmongTenModels]

calculateMutationalFreq(genomic_features.loc[patient_embedding_holdout.index],topFeatureList)

In [None]:
genomic_features[['molecular_PIK3CA']].value_counts()/len(genomic_features)

# gene signature test (biomarker + vs -)

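### Based on the important genomic features identified above, a gene signature can be constructed from those genes. A patient is labelled "Mut" if more than one of those genes is mutated and "Wt" (wild type) if none of them is mutated; patients with exactly one mutated gene remain "Unknown" and are not part of the Mut-vs-Wt comparison.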

In [None]:
# Kaplan-Meier comparison of signature-mutated vs wild-type patients in the training cohort.
y_dataframe_censorship = [x[0] for x in y_dataframe]
y_dataframe_time = [x[1] for x in y_dataframe]

signature_genes = ['molecular_STK11', 'molecular_KRAS', 'molecular_KEAP1', 'molecular_TP53', 'molecular_SMARCA4',
                   'molecular_ATM', 'molecular_EPHA3', 'molecular_PTPRD', 'molecular_RBM10', 'molecular_FAT1']
# .copy() so that adding the 'signature' column does not write into a slice of genomic_features.
mutations_train = genomic_features.loc[patient_embedding_dataframe.index, signature_genes].copy()
# signature = number of mutated signature genes per patient
mutations_train['signature'] = mutations_train.sum(axis=1)

df = pd.DataFrame([list(mutations_train['signature']), y_dataframe_time, y_dataframe_censorship]).T
df.columns = ['signature', 'OS', 'censorLabel']
# Patients with exactly one mutated gene keep the 'Unknown' label and are not plotted.
df['group'] = 'Unknown'
df.loc[df.signature > 1, 'group'] = "Mut"
df.loc[df.signature == 0, 'group'] = "Wt"

axs = subplots(cols=1, rows=1, w=6, h=4)
# score must name a column of df; this frame has 'signature', not 'predictRisk'.
KMPlot(df, time='OS', event='censorLabel', label=['group'], score='signature').plot(
    ['Mut', 'Wt'], ax=axs[0],
    comparisons=[['Wt', 'Mut', 'Wt vs Mut']],
    saturation=0.9,
    linewidth=1.5,
    palette='Set1',
    template_color='black', xy_font_size=18,
    hr_color='black',
    x_legend=0.5, y_legend=0.95, legend_font_size=12,
    label_height_adj=0.06,
    x_hr_legand=0.0, y_hr_legend=.1, hr_font_size=12,
);

sns.despine(offset=2)

In [None]:
# Kaplan-Meier comparison of signature-mutated vs wild-type patients in the holdout cohort.
y_holdout_censorship = [x[0] for x in y_holdout]
y_holdout_time = [x[1] for x in y_holdout]

signature_genes = ['molecular_STK11', 'molecular_KRAS', 'molecular_KEAP1', 'molecular_TP53', 'molecular_SMARCA4',
                   'molecular_ATM', 'molecular_EPHA3', 'molecular_PTPRD', 'molecular_RBM10', 'molecular_FAT1']
# .copy() so that adding the 'signature' column does not write into a slice of genomic_features.
mutations_holdout = genomic_features.loc[patient_embedding_holdout.index, signature_genes].copy()
# signature = number of mutated signature genes per patient
mutations_holdout['signature'] = mutations_holdout.sum(axis=1)

df = pd.DataFrame([list(mutations_holdout['signature']), y_holdout_time, y_holdout_censorship]).T
df.columns = ['signature', 'OS', 'censorLabel']
# Patients with exactly one mutated gene keep the 'Unknown' label and are not plotted.
df['group'] = 'Unknown'
df.loc[df.signature > 1, 'group'] = "Mut"
df.loc[df.signature == 0, 'group'] = "Wt"

axs = subplots(cols=1, rows=1, w=6, h=4)
# score must name a column of df; this frame has 'signature', not 'predictRisk'.
KMPlot(df, time='OS', event='censorLabel', label=['group'], score='signature').plot(
    ['Mut', 'Wt'], ax=axs[0],
    comparisons=[['Wt', 'Mut', 'Wt vs Mut']],
    saturation=0.9,
    linewidth=1.5,
    palette='Set1',
    template_color='black', xy_font_size=18,
    hr_color='black',
    x_legend=0.5, y_legend=0.95, legend_font_size=12,
    label_height_adj=0.06,
    x_hr_legand=0.0, y_hr_legend=.1, hr_font_size=12,
);

sns.despine(offset=2)

# single gene survival prediction

### the following function is to evaluate whether a single gene can be used to stratify patient OS

In [None]:
#y_dataframe_censorship=[x[0] for x in y_holdout]
#y_dataframe_time=[x[1] for x in y_holdout]

mutation=list(genomic_features.loc[patient_embedding_holdout.index,'molecular_FAT1'])

df=pd.DataFrame([mutation,y_holdout_time,y_holdout_censorship]).T
df.columns=['mutation','OS','censorLabel']
df['group'] = 'Unknown'
df.loc[df.mutation == 1,'group']= "mutated"
df.loc[df.mutation == 0,'group']= "notMutated"



axs = subplots(cols=1, rows=1, w=8, h=5)
KMPlot(df, time='OS', event='censorLabel', label=[ 'group'], score='mutation').plot(
    ['mutated', 'notMutated'], ax=axs[0],
    comparisons=[['notMutated', 'mutated']],
    label_font_size = 15,
    xy_font_size=18,
    saturation=0.9,
    label_height_adj=0.2,
    linewidth=1.5,
    palette='Set1',
    template_color = 'black',
    
);

# plot BIKG predicted risk vs all genomic predict risk

### the following block is used to compare OS prediction between the model using BIKG and the model using the gene panel only.

In [None]:
BIKGPredictedRisk=pd.read_csv('../Data/outputs/BIKGPredictedRisk_MSK.csv')
BIKGPredictedRisk

In [None]:
MSKGenePanelPredictedRisk=pd.read_csv('../Data/outputs/MSKGenePanelPredictedRisk.csv')
MSKGenePanelPredictedRisk

In [None]:
import matplotlib.pyplot as plt
plt_df=pd.DataFrame(list(zip(BIKGPredictedRisk.predictRisk,MSKGenePanelPredictedRisk.predictRisk,BIKGPredictedRisk.OS)),columns=['BIKGPrior','MSKPanel','OS'])

plt_df.sort_values(by=['OS'],ascending=True,inplace=True)
plt_df

In [None]:
x=np.arange(plt_df.shape[0])
y_BIKG=list(plt_df.BIKGPrior)
y_MSK=list(plt_df.MSKPanel)
y_OS=list(plt_df.OS)

In [None]:
for n in [18, 35, 53,71]:
    avgPredictedRisk_BIKG=np.mean(y_BIKG[:n])
    avgPredictedRisk_MSK=np.mean(y_MSK[:n])
    print((avgPredictedRisk_BIKG-avgPredictedRisk_MSK)/avgPredictedRisk_MSK)

In [None]:
for n in [18, 35, 53,71]:
    avgPredictedRisk_BIKG=np.mean(y_BIKG[-n:])
    avgPredictedRisk_MSK=np.mean(y_MSK[-n:])
    print((avgPredictedRisk_BIKG-avgPredictedRisk_MSK)/avgPredictedRisk_MSK)

In [None]:
# Scatter the predicted risk of both models against patient rank (patients sorted by
# ascending OS) and overlay a least-squares trend line for each model.
# LinearRegression is not imported anywhere else in the notebook, so import it here.
from sklearn.linear_model import LinearRegression

fig, ax1 = plt.subplots(1, 1, sharex=True, figsize=(8, 6))
rank_column = x.reshape(-1, 1)  # scikit-learn expects a 2-D feature matrix

# BIKG-based predicted risk and its linear trend
ax1.scatter(x, y_BIKG, c='r', s=1, label='BIKG')
model_BIKG = LinearRegression()
model_BIKG.fit(rank_column, y_BIKG)
slope_BIKG = model_BIKG.coef_[0]
intercept_BIKG = model_BIKG.intercept_
ax1.plot(x, slope_BIKG * x + intercept_BIKG, 'r--', linewidth=2)

# Constrain the y-axis (predicted risk) to the range 200-1000
ax1.set_ylim(200, 1000)

# MSK gene panel predicted risk and its linear trend
ax1.scatter(x, y_MSK, c='b', s=1, label='MSK gene panel')
model_MSK = LinearRegression()
model_MSK.fit(rank_column, y_MSK)
slope_MSK = model_MSK.coef_[0]
intercept_MSK = model_MSK.intercept_
ax1.plot(x, slope_MSK * x + intercept_MSK, 'b--', linewidth=2)

# Axis labels
ax1.set_ylabel('Predicted risk', fontsize=16)
ax1.set_xlabel('Patient sorted in the ascending OS order', fontsize=16)

# Legend in the top right corner
ax1.legend(loc='upper right', fontsize=16)

# Show the plot
plt.show()

In [None]:
slope_BIKG-slope_MSK

In [None]:
slope_BIKG

In [None]:
slope_MSK

# heat map of co-occurrence

### the following block is used to generate a heatmap of gene co-occurrence

In [None]:
importantGeneList=['molecular_STK11','molecular_KRAS','molecular_KEAP1','molecular_TP53','molecular_SMARCA4','molecular_ATM','molecular_PTPRD','molecular_EPHA3','molecular_RBM10','molecular_FAT1']


In [None]:
genomic_features[importantGeneList]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt


def gene_mutations_plot(gene_mutations_df):
    """Plot how many patients carry exactly 2, 3, 4, or more than 4 mutated genes.

    Args:
        gene_mutations_df: DataFrame with one row per patient and one numeric
            column per gene; each row is summed to get the patient's mutation count.

    The input frame is not modified. Patients with fewer than two mutations fall
    outside every bin and are not counted.
    """
    # Keep the per-patient count in a local Series. Writing it back as a column
    # mutated the caller's frame and made a second call sum its own
    # 'mutation_count' column as well.
    mutation_count = gene_mutations_df.sum(axis=1)

    # Half-open bins [2,3), [3,4), [4,5), [5, upper). The upper edge must stay above 5,
    # otherwise pd.cut rejects the non-increasing bin edges when no patient has 5+ mutations.
    highest = mutation_count.max()
    upper_edge = 6 if pd.isna(highest) else max(highest + 1, 6)
    mutation_bins = [2, 3, 4, 5, upper_edge]

    # The bins are disjoint, so the labels describe exact counts.
    mutation_labels = ['exactly two gene mutations',
                       'exactly three gene mutations',
                       'exactly four gene mutations',
                       'more than four gene mutations']

    # Group patients into the bins, count them, and keep the bars in bin order
    binned = pd.cut(mutation_count, bins=mutation_bins, labels=mutation_labels, right=False)
    mutation_counts = binned.value_counts().reindex(mutation_labels)

    # Plot the bar graph
    mutation_counts.plot(kind='bar')
    plt.xlabel('Mutation Count')
    plt.ylabel('Number of Patients')
    plt.title('Distribution of Patients by Gene Mutation Count')
    plt.show()

In [None]:
gene_mutations_plot(genomic_features[importantGeneList])

In [None]:
sns.set(font_scale=1.5)

In [None]:
def getCooccurence(df,geneListOfInterest):
    """Return the gene-by-gene co-occurrence matrix for the selected columns.

    Args:
        df: DataFrame with one row per sample and one column per gene; values
            are cast to int before counting.
        geneListOfInterest: Column labels to include, in output order.

    Returns:
        Square DataFrame ``M`` where ``M[a][b]`` is the dot product of columns
        ``a`` and ``b``; labels have a leading ``"molecular_"`` removed.
    """
    prefix = "molecular_"
    df_asint = df[geneListOfInterest].astype(int)
    # Remove the prefix as a whole string. str.lstrip("molecular_") strips any leading
    # run of the characters m/o/l/e/c/u/a/r/_ and would also eat the start of names
    # beginning with one of those letters.
    df_asint.columns = [
        label[len(prefix):] if label.startswith(prefix) else label
        for label in df_asint.columns
    ]
    cooccurMat = df_asint.T.dot(df_asint)
    return cooccurMat

def plotCoocurMat(cooccurMat):
    """Draw ``cooccurMat`` as an integer-annotated heatmap with column labels on top."""
    _, heat_ax = plt.subplots(figsize=(15, 10))
    sns.heatmap(
        cooccurMat,
        ax=heat_ax,
        cmap="crest",
        annot=True,
        fmt='d',
    )
    # Blank out the axis titles and move the x tick labels above the matrix.
    heat_ax.set(xlabel="", ylabel="")
    heat_ax.xaxis.tick_top()

In [None]:
cooccurMat_MSK = getCooccurence(genomic_features.loc[patient_embedding_dataframe.index],importantGeneList)
plotCoocurMat(cooccurMat_MSK)

In [None]:
cooccurMat_MSKholdout = getCooccurence(genomic_features.loc[patient_embedding_holdout.index],importantGeneList)
plotCoocurMat(cooccurMat_MSKholdout)