# What this notebook is about


ClinTrajan package:
https://github.com/auranic/ClinTrajan 

ElPiGraph package: 
https://github.com/j-bac/elpigraph-python


In [None]:
#pd.set_option('display.max_rows', 500)
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)

# Example of using ClinTrajan

# Part 1. Quantification of Data

### Importing/installing libraries

In [None]:
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

from importlib import reload  
import scipy.stats


In [None]:
!pip install lifelines
from lifelines import KaplanMeierFitter
from lifelines.utils import concordance_index
!pip install  --no-dependencies  git+https://github.com/j-bac/elpigraph-python.git
import elpigraph
!pip install trimap

import sys
sys.path.insert(0,'/kaggle/input/breast-cancer-omics-bulk-data/code/')# "/path/to/your/package_or_module")
print(sys.path)

from clintraj_qi import *
from clintraj_eltree import *
from clintraj_util import *
from clintraj_ml import *
from clintraj_optiscale import *

### Loading data (categorical variables are assumed to be dummy-encoded already)

In [None]:
# Load the omics table, transpose it, and strip the 'BRCA-METABRIC-S1-' prefix
# from the (post-transpose) index so it can be joined with the clinical table.
# NOTE(review): presumably rows are samples after the transpose - confirm against the file.
# load omics data
df1 = pd.read_csv('/kaggle/input/breast-cancer-omics-bulk-data/METABRIC.txt', sep = '\t', index_col = 0)
df1=df1.T
i1 = [s.replace('BRCA-METABRIC-S1-','') for s in df1.index ]
#print('number of common ids:', len(set(i2) & set(df1.index) ) )
df1.index = i1
df1
# Load the clinical table and index it by 'Patient ID' (the join key).
# load clinical data
df2 = pd.read_csv('/kaggle/input/breast-cancer-omics-bulk-data/METABRIC_clinical.txt', sep = '\t')#, index_col = 0)
df2 = df2.set_index('Patient ID')
df2
# Inner join on the index: only ids present in both tables are kept,
# clinical columns come first, omics columns after them.
df = df2.join(df1, how = 'inner')
print('Joined data shape', df.shape)
df
# Keep only rows with a known relapse status and encode it as 0/1.
# Any status string other than the two mapped ones becomes NaN.
m = df['Relapse Free Status'].notnull()
print( m.sum() )
df = df[m].copy()
df['Relapse Free Status'] = df['Relapse Free Status'].map({'0:Not Recurred':0,'1:Recurred':1 } )
print(df.shape)
display(df.head())

# df_full keeps clinical + omics columns; df keeps everything from column 37 on.
# NOTE(review): 37 is assumed to be the number of clinical columns - verify if the clinical file changes.
df_full = df.copy()
df = df.iloc[:,37:] # OMICS data only

In [None]:
# Show the omics-only table and report missing values.
# quantify_nans comes from the star-imported clintraj modules - its exact output is not visible here.
display(df)
quantify_nans(df)

### Preprocessing 

Keep the 1000 variables with the highest variance.

This step is intended for gene expression data: out of tens of thousands of genes, we keep only the 1000 most variable ones.


Detect variable types

Quantify (and optimize) the ordinal variables via optimal scaling




In [None]:
# Keep the 1000 columns with the largest variance.
# nanvar is used so that a column containing missing values gets a finite
# variance; with plain var() it would be NaN, and argsort ranks NaN last,
# i.e. it would be picked as if it were a top-variance column.
X_var = np.nanvar(df.values, axis = 0)
print( X_var.shape, X_var[:5] )
ix = np.argsort(X_var)
df = df.iloc[:,ix[-1000:]]
print(df.shape)

# Remove constant columns BEFORE detecting variable types, so that
# variable_types has exactly one entry per column of the matrix X that is
# passed to optimal_scaling below.
df = remove_constant_columns_from_dataframe(df)

# Detect variable types
variable_types, binary, continuous, ordinal = detect_variable_type(df,10,verbose=False)
# All variables for current data will be recognized as "continuous"
print( type(variable_types), type(binary), type(continuous), type(ordinal) ) # All are list
print( variable_types[:3], binary[:3], continuous[:3], ordinal[:3] )
print()

# Quantify the data table by applying optimal scaling to the ordinal variables.
variable_names = [str(s) for s in df.columns[0:]]
X = df.to_numpy().copy()
X_original = X.copy()        # raw values, kept for visualizations
X_before_scaling = X.copy()
X,cik = optimal_scaling(X,variable_types,verbose=True,vmax=0.6)
X_save = X.copy()            # quantified matrix before any later z-scoring / PCA


In [None]:
# Quick look at the quantified matrix: first two principal components,
# coloured by the PAM50 (+ Claudin-low) subtype of each sample.
f = 'Pam50 + Claudin-low subtype' #   'pam50_+_claudin-low_subtype'
vec4color = df_full[f]

from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

# Instantiate the estimator once so that `pca` holds the fitted model
# (previously `pca` was bound to the PCA class and the fitted model was lost).
pca = PCA()
r = pca.fit_transform(X)
plt.figure(figsize = (20,10))
sns.scatterplot( x=r[:,0], y=r[:,1], hue = vec4color )
plt.title('PCA for Omics data colored by Pam50 groups')
plt.show()

In [None]:
X.shape

#### OK, we finished preparing the data matrix X, which is now complete and properly quantified. We also keep the 'original matrix' X_original, with 'raw' values of the variables (will be needed for visualizations)

# Part 2. Computing the principal tree

## Visualization function

In [None]:
from sklearn.decomposition import PCA
# umap is optional: it is only needed for plot_graph(..., dim_reduction='umap').
# Catch Exception rather than using a bare except so that KeyboardInterrupt /
# SystemExit are not swallowed, while any import-time failure stays non-fatal.
try:
    import umap
except Exception:
    print('cannot import umap')

def plot_graph(edges, nodes_positions, data = None, dim_reduction = 'PCA', graph_color = 'black', graph_linewidth=2, 
               plot_data = True, data_linewidth = 1,  data_color = None, palette = None,  
               data_transparency_alpha = 0.9,
               showNodeNumbers = True, # Shows text with internal number of each node
               umap_n_neighbors = 50, umap_min_dist = 0.99):
  '''
  Plot a graph (nodes and edges) in 2D on the current matplotlib axes,
  optionally on top of a scatter plot of `data`.

  If `dim_reduction` is 'PCA' or 'umap', a reducer is fitted on `data`
  (or on `nodes_positions` when `data` is None) and both the data and the
  nodes are drawn in the reduced coordinates. For any other value
  (e.g. 'plot_first2axis') the first two coordinates are used as they are.

  Parameters
  ----------
  edges : Nx2 integer array-like; edges[k,0], edges[k,1] are the node
      indices of the two ends of the k-th edge.
  nodes_positions : 2D array, one row per node.
  data : optional 2D array to scatter-plot; it should have the same
      shape[1] as nodes_positions.
  dim_reduction : 'PCA', 'umap', or anything else for "first two axes".
  graph_color, graph_linewidth : colour and line width of nodes and edges.
  plot_data : whether to scatter-plot `data`.
  data_linewidth, data_transparency_alpha : passed to plt.scatter for the data.
  data_color : None, or per-point values. With 'PCA'/'umap' they are used
      as `hue` in seaborn.scatterplot; otherwise as `c` in plt.scatter.
  palette : optional seaborn palette (used only together with data_color
      in the 'PCA'/'umap' case).
  showNodeNumbers : write the index of each node next to it.
  umap_n_neighbors, umap_min_dist : parameters of umap.UMAP.

  Examples
  --------
  edges = np.array([ [0,1],[1,2],[2,0] ] )
  nodes_positions = np.random.rand(3,10) # 3 points in 10d space
  plot_graph(edges, nodes_positions)

  t = elpigraph_output
  edges = t[0]['Edges'][0]
  nodes_positions = t[0]['NodePositions']
  plot_graph(edges, nodes_positions)
  '''
  use_reducer = dim_reduction in ['PCA', 'umap']

  if use_reducer:
    if dim_reduction == 'PCA':
      reducer = PCA()
    else:
      reducer = umap.UMAP(n_neighbors=umap_n_neighbors, min_dist=umap_min_dist, n_components=2)

    if data is not None:
      data2 = reducer.fit_transform(data)
    else:
      # No data given: fit the projection on the nodes themselves.
      data2 = None
      reducer.fit(nodes_positions)
    nodes_positions2 = reducer.transform(nodes_positions)
  else:
    data2 = data
    nodes_positions2 = nodes_positions

  if plot_data and data2 is not None:
    if data_color is None:
      plt.scatter(data2[:,0], data2[:,1], linewidth = data_linewidth, alpha = data_transparency_alpha)
    elif use_reducer:
      # Plot the PROJECTED data so that it lives in the same coordinates
      # as the projected nodes (previously the raw `data` columns were used).
      sns.scatterplot(x=data2[:,0], y=data2[:,1], hue = data_color, palette = palette)
    else:
      plt.scatter(data2[:,0], data2[:,1], cmap=plt.cm.Paired, c = data_color,
                  linewidth = data_linewidth, alpha = data_transparency_alpha)

    if use_reducer:
      plt.xlabel(dim_reduction+'1')
      plt.ylabel(dim_reduction+'2')

  # Nodes
  plt.scatter(nodes_positions2[:,0], nodes_positions2[:,1], c = graph_color, linewidth = graph_linewidth)

  # Edges; colour is passed by keyword so that any matplotlib colour spec works.
  for n0, n1 in np.asarray(edges):
    x_line = [ nodes_positions2[n0,0],  nodes_positions2[n1,0] ]
    y_line = [ nodes_positions2[n0,1],  nodes_positions2[n1,1] ]
    plt.plot(x_line, y_line, color = graph_color, linewidth = graph_linewidth)

  if showNodeNumbers:
    for i in range(nodes_positions2.shape[0]):
      # lower-case `fontsize`: case-variant property names are rejected by current matplotlib
      plt.text(nodes_positions2[i,0], nodes_positions2[i,1], str(i), fontsize=20,
               bbox=dict(facecolor='grey', alpha=0.5))
    
# Usage example for plot_graph: a triangle whose 3 nodes are random points in 10 dimensions.
# No `data` is passed, so only the graph itself is drawn; the node positions
# are random, so the picture differs between runs.
edges = np.array([ [0,1],[1,2],[2,0] ] )
nodes_positions = np.random.rand(3,10) # 3 points in 10d space
plot_graph(edges, nodes_positions)
plt.title('Example graph plot with  plot_graph function')
plt.show()

## First of all, we will reduce the dimension using PCA

In [None]:
reduced_dimension = 30

# Start from X_save (the quantified matrix stored right after optimal scaling)
# instead of the current X: this cell overwrites X with its PCA projection,
# so reading X here would give a different result every time the cell is re-run.
X = scipy.stats.zscore(X_save)

# With svd_solver='full' the number of components cannot exceed
# min(n_samples, n_features).
pca = PCA(n_components=min(X.shape),svd_solver='full')
Y = pca.fit_transform(X)
v = pca.components_.T          # principal axes as columns
mean_val = np.mean(X,axis=0)   # column means of the z-scored matrix
X = Y[:,0:reduced_dimension]   # keep the leading components only

## We are ready to compute the principal tree, let us do it

In [None]:
#import sys
#print(sys.path)
#sys.path.append('/home/zinovyev/anaconda3/lib/python3.7/site-packages')
#print(sys.path)

# Fit an elastic principal tree with `nnodes` nodes to the reduced matrix X.
# NOTE(review): alpha / Mu / Lambda / FinalEnergy are elpigraph parameters;
# their meaning is not visible here - see the elpigraph documentation.
nnodes = 20
tree_elpi = elpigraph.computeElasticPrincipalTree(X,nnodes, # drawPCAView=True,
                                                  alpha=0.01,Mu=0.1,Lambda=0.05,
                                                  FinalEnergy='Penalized')
# The call returns an indexable result; only its first element is used.
tree_elpi = tree_elpi[0]
# some additional pruning of the graph
# (the return value is not used, so prune_the_tree presumably modifies tree_elpi in place - confirm)
prune_the_tree(tree_elpi)
# extend the leafs to reach the extreme data points
tree_extended = ExtendLeaves_modified(X, tree_elpi, Mode = "QuantDists", ControlPar = .5, DoSA = False)

In [None]:
# Draw the extended tree over the data, coloured by PAM50 (+ Claudin-low) subtype.
f = 'Pam50 + Claudin-low subtype' #   'pam50_+_claudin-low_subtype'
# Plain array of labels, one per row of df_full.
vec4colors = df_full[f].values


tree = tree_extended
#nodes_positions = tree['NodePositions'] # ['AllNodePositions'][k]
#matrix_edges_weights = tree['ElasticMatrix'] # ['AllElasticMatrices'][k]
#matrix_edges_weights = np.triu(matrix_edges_weights,1 )
#edges = np.array( np.nonzero(matrix_edges_weights), dtype = int ).transpose()    
# Node coordinates and the edge list are read directly from the tree dictionary.
nodes_positions = tree['NodePositions']
edges = tree['Edges'][0]

plt.figure(figsize = (12,12))
#plot_graph(edges, nodes_positions, data = X , data_color = 'tab:blue', data_transparency_alpha = 0.3 )
plot_graph(edges, nodes_positions, data = X , data_color = vec4colors) # df[f]) # 'tab:blue', data_transparency_alpha = 0.3 )
plt.grid()
plt.legend( fontsize =15 )
plt.title('PCA for METABRIC dataset and PAM50(+Claudin-low) subtypes', fontsize =15)
plt.show()

In [None]:
# Partition the data points by tree branches: one integer label per point.
vec_labels_by_branches = partition_data_by_tree_branches(X,tree_extended)
print(len(set(vec_labels_by_branches)),'labels generated')

# Approximate, hand-made correspondence between branch labels and subtypes.
# It was written for one particular tree; a different tree may produce other
# labels, so unknown labels fall back to a generic name instead of raising KeyError.
dict_groups_correspondence_approximate = {0:'Graph LumAB', 1:'Graph LumB',2:'Graph Basal',3:'Graph Her2',4:'Graph LumA'}
vec4colors = [
    dict_groups_correspondence_approximate.get(label, 'Graph branch ' + str(label))
    for label in vec_labels_by_branches
]
print(pd.Series(vec_labels_by_branches).value_counts() )
print(pd.Series(vec4colors).value_counts() )


# Trajectories and Pseudotime

In [None]:
# Extract all trajectories starting at `root_node` and compute pseudotime along each.
# NOTE(review): the tree was built with nnodes = 20, so index 20 is presumably
# a node added by the leaf extension - confirm that this is the intended root.
root_node = 20
all_trajectories,all_trajectories_edges = extract_trajectories(tree_extended,root_node)
print(len(all_trajectories),' trajectories found.')
ProjStruct = project_on_tree(X,tree_extended)
PseudoTimeTraj = quantify_pseudotime(all_trajectories,all_trajectories_edges,ProjStruct)

# Print every trajectory that has at least one edge, with its mean pseudotime.
for i,pstt in enumerate(PseudoTimeTraj):
    if len(all_trajectories_edges[i]) == 0: continue 
    #if i == 0: continue 
    #if i == 1: continue 
        
    # Trajectory name = first and last node of the trajectory (not used further in this cell).
    TrajName = 'Trajectory:'+str(pstt['Trajectory'][0])+'--'+str(pstt['Trajectory'][-1])
    points = pstt['Points']
    print(pstt)
    print(np.mean(pstt['Pseudotime']))
#pstt    

In [None]:
# Histogram of pseudotime for every trajectory that has at least one edge.

# Number of panels needed. The grid keeps the original 3 columns when there
# are up to 3 trajectories and grows instead of failing when there are more.
n_panels = sum(1 for k in range(len(PseudoTimeTraj)) if len(all_trajectories_edges[k]) > 0)
n_cols = max(3, n_panels)

c = 0; fig = plt.figure(figsize = (20,6))

for i,pstt in enumerate(PseudoTimeTraj):
    if len(all_trajectories_edges[i]) == 0: continue

    c+=1; fig.add_subplot(1, n_cols, c)

    TrajName = 'Trajectory:'+str(pstt['Trajectory'][0])+'--'+str(pstt['Trajectory'][-1])
    print(pstt)

    plt.hist(pstt['Pseudotime'] )
    plt.title(TrajName)
    print(np.mean(pstt['Pseudotime']) ,np.median(pstt['Pseudotime'])  )
    print()

plt.show()
    

#  Pseudotime is a good prognostic factor for survival

Show KM plots for intervals of pseudo-time along three trajectories.


In [None]:
# Kaplan-Meier curves of relapse-free survival, one panel per trajectory,
# splitting the points of the trajectory at a pseudotime threshold.
c1, c2 = 'Relapse Free Status (Months)', 'Relapse Free Status'
T0 = df_full[c1] 
E0 = df_full[c2]

# Hand-picked pseudotime cut-offs, indexed by trajectory number.
# Trajectories beyond this list fall back to the rounded median pseudotime.
pseudotime_thresholds = [4,6,4,4]

# Keep the original 3-column layout, but do not fail if there are more panels.
n_panels = sum(1 for k in range(len(PseudoTimeTraj)) if len(all_trajectories_edges[k]) > 0)
n_cols = max(3, n_panels)

c = 0; fig = plt.figure(figsize = (24,6))

for i,pstt in enumerate(PseudoTimeTraj):
    if len(all_trajectories_edges[i]) == 0: continue 

    TrajName = 'Trajectory:'+str(pstt['Trajectory'][0])+'--'+str(pstt['Trajectory'][-1])
    points = pstt['Points']

    # 'Points' holds row positions, while df_full is indexed by patient id:
    # select by position explicitly instead of relying on the deprecated
    # positional fallback of Series[...].
    T_traj = T0.iloc[points]
    E_traj = E0.iloc[points]

    if i < len(pseudotime_thresholds):
        threshold = pseudotime_thresholds[i]
    else:
        threshold = np.round( np.median( pstt['Pseudotime'] ) , 1)

    c+=1; fig.add_subplot(1, n_cols, c) 
    for flag in [0,1]:
        if flag == 0:
            mask = pstt['Pseudotime'] <= threshold
            lbl = 'Pseudotime <= ' + str(threshold)
        else:
            mask = pstt['Pseudotime'] > threshold
            lbl = 'Pseudotime > ' + str(threshold)

        print(mask.sum() )
        T = T_traj[mask]
        E = E_traj[mask]

        kmf = KaplanMeierFitter(label=lbl)
        kmf.fit(T,E)
        kmf.plot()
    plt.xlim([0,250])
    plt.xlabel('timeline', size =15)
    plt.legend( fontsize =15)
    plt.ylim([0.30,1])

    plt.title('Relapse Free ' + TrajName , fontsize =15)

plt.show()


#  Pseudotime is not better than some of the genes (in particular cell cycle)

In [None]:
# Distribution of CDC20 expression over the points of the last trajectory.
# The trajectory is taken explicitly (earlier this relied on the loop variable
# `pstt` left over from a previous cell), and rows are selected by position,
# because 'Points' holds row positions while df is indexed by patient id.
pstt = PseudoTimeTraj[-1]
plt.hist( df['CDC20'].iloc[pstt['Points']] )

In [None]:
# Kaplan-Meier curves per trajectory: split by pseudotime threshold and, on the
# same axes, split by the mean expression of a gene (one row of panels per gene).
c1, c2 = 'Relapse Free Status (Months)', 'Relapse Free Status'
T0 = df_full[c1] 
E0 = df_full[c2]

gene_names = ['CDC20'] # , 'UBE2C', 'MELK'

# Hand-picked pseudotime cut-offs, indexed by trajectory number.
# Trajectories beyond this list fall back to the rounded median pseudotime.
pseudotime_thresholds = [4,6,4,4]

# Keep the original 3-column layout, but do not fail if there are more panels.
n_panels = sum(1 for k in range(len(PseudoTimeTraj)) if len(all_trajectories_edges[k]) > 0)
n_cols = max(3, n_panels)

fig = plt.figure(figsize = (24, 8))
for ig, gen_name in enumerate(gene_names):
    c = 0
    for i,pstt in enumerate(PseudoTimeTraj):
        if len(all_trajectories_edges[i]) == 0: continue 

        TrajName = 'Trajectory:'+str(pstt['Trajectory'][0])+'--'+str(pstt['Trajectory'][-1])
        points = pstt['Points']

        # 'Points' holds row positions, while the tables are indexed by
        # patient id: select by position explicitly.
        T_traj = T0.iloc[points]
        E_traj = E0.iloc[points]
        expr = df[gen_name].iloc[points]

        if i < len(pseudotime_thresholds):
            threshold = pseudotime_thresholds[i]
        else:
            threshold = np.round( np.median( pstt['Pseudotime'] ) , 1)
        thres = np.round( np.mean(expr) , 1)

        # (mask, legend label) for every curve on this panel; labels now
        # match the comparison actually used to build each mask.
        groups = [
            (pstt['Pseudotime'] <= threshold, 'Pseudotime <= ' + str(threshold)),
            (pstt['Pseudotime'] > threshold,  'Pseudotime > ' + str(threshold)),
            ((expr < thres).to_numpy(),       gen_name + ' < ' + str(thres)),
            ((expr >= thres).to_numpy(),      gen_name + ' >= ' + str(thres)),
        ]

        c+=1; fig.add_subplot(len(gene_names), n_cols, ig * n_cols + c) 
        for mask, lbl in groups:
            kmf = KaplanMeierFitter(label=lbl)
            kmf.fit(T_traj[mask], E_traj[mask])
            kmf.plot()

        plt.xlim([0,250])
        plt.xlabel('timeline', size =15)
        plt.legend( fontsize =15)
        plt.ylim([0.30,1])
        plt.title('Relapse Free ' + TrajName , fontsize =15)
plt.show()


In [None]:
# Two rows of Kaplan-Meier panels, one column per trajectory:
#   row 1 - split by pseudotime threshold only;
#   row 2 - the same curves plus a split by the mean expression of the gene.
c1, c2 = 'Relapse Free Status (Months)', 'Relapse Free Status'
T0 = df_full[c1] 
E0 = df_full[c2]

# The gene is listed twice on purpose: the first pass (ig == 0) draws the
# pseudotime-only row, the second pass adds the gene-based curves.
gene_names = ['CDC20', 'CDC20'] # , 'UBE2C', 'MELK'

# Hand-picked pseudotime cut-offs, indexed by trajectory number.
# Trajectories beyond this list fall back to the rounded median pseudotime.
pseudotime_thresholds = [4,6,4,4]

# Keep the original 3-column layout, but do not fail if there are more panels.
n_panels = sum(1 for k in range(len(PseudoTimeTraj)) if len(all_trajectories_edges[k]) > 0)
n_cols = max(3, n_panels)

fig = plt.figure(figsize = (24,16))
for ig, gen_name in enumerate(gene_names):
    c = 0
    for i,pstt in enumerate(PseudoTimeTraj):
        if len(all_trajectories_edges[i]) == 0: continue 

        TrajName = 'Trajectory:'+str(pstt['Trajectory'][0])+'--'+str(pstt['Trajectory'][-1])
        points = pstt['Points']

        # 'Points' holds row positions, while the tables are indexed by
        # patient id: select by position explicitly.
        T_traj = T0.iloc[points]
        E_traj = E0.iloc[points]

        if i < len(pseudotime_thresholds):
            threshold = pseudotime_thresholds[i]
        else:
            threshold = np.round( np.median( pstt['Pseudotime'] ) , 1)

        # (mask, legend label) for every curve on this panel.
        groups = [
            (pstt['Pseudotime'] <= threshold, 'Pseudotime <= ' + str(threshold)),
            (pstt['Pseudotime'] > threshold,  'Pseudotime > ' + str(threshold)),
        ]
        if ig > 0:
            expr = df[gen_name].iloc[points]
            thres = np.round( np.mean(expr) , 1)
            # labels now match the comparison actually used to build each mask
            groups.append(((expr < thres).to_numpy(),  gen_name + ' < ' + str(thres)))
            groups.append(((expr >= thres).to_numpy(), gen_name + ' >= ' + str(thres)))

        c+=1; fig.add_subplot(len(gene_names), n_cols, ig * n_cols + c) 
        for mask, lbl in groups:
            kmf = KaplanMeierFitter(label=lbl)
            kmf.fit(T_traj[mask], E_traj[mask])
            kmf.plot()

        plt.xlim([0,250])
        plt.xlabel('timeline', size =15)
        plt.legend( fontsize =15)
        plt.ylim([0.30,1])
        plt.title('Relapse Free ' + TrajName , fontsize =15)
plt.show()


In [None]:
# One figure per gene: Kaplan-Meier curves per trajectory, split by pseudotime
# threshold and, on the same axes, by the mean expression of the gene.
c1, c2 = 'Relapse Free Status (Months)', 'Relapse Free Status'
T0 = df_full[c1] 
E0 = df_full[c2]

# Hand-picked pseudotime cut-offs, indexed by trajectory number.
# Trajectories beyond this list fall back to the rounded median pseudotime.
pseudotime_thresholds = [4,6,4,4]

# Keep the original 3-column layout, but do not fail if there are more panels.
n_panels = sum(1 for k in range(len(PseudoTimeTraj)) if len(all_trajectories_edges[k]) > 0)
n_cols = max(3, n_panels)

for gen_name in ['CDC20', 'UBE2C', 'MELK' ]:
    c = 0; fig = plt.figure(figsize = (24,6))
    for i,pstt in enumerate(PseudoTimeTraj):
        if len(all_trajectories_edges[i]) == 0: continue 

        TrajName = 'Trajectory:'+str(pstt['Trajectory'][0])+'--'+str(pstt['Trajectory'][-1])
        points = pstt['Points']

        # 'Points' holds row positions, while the tables are indexed by
        # patient id: select by position explicitly.
        T_traj = T0.iloc[points]
        E_traj = E0.iloc[points]
        expr = df[gen_name].iloc[points]

        if i < len(pseudotime_thresholds):
            threshold = pseudotime_thresholds[i]
        else:
            threshold = np.round( np.median( pstt['Pseudotime'] ) , 1)
        thres = np.round( np.mean(expr) , 1)

        # (mask, legend label) for every curve on this panel; labels now
        # match the comparison actually used to build each mask.
        groups = [
            (pstt['Pseudotime'] <= threshold, 'Pseudotime <= ' + str(threshold)),
            (pstt['Pseudotime'] > threshold,  'Pseudotime > ' + str(threshold)),
            ((expr < thres).to_numpy(),       gen_name + ' < ' + str(thres)),
            ((expr >= thres).to_numpy(),      gen_name + ' >= ' + str(thres)),
        ]

        c+=1; fig.add_subplot(1, n_cols, c) 
        for mask, lbl in groups:
            kmf = KaplanMeierFitter(label=lbl)
            kmf.fit(T_traj[mask], E_traj[mask])
            kmf.plot()

        plt.xlim([0,250])
        plt.xlabel('timeline', size =15)
        plt.legend( fontsize =15)
        plt.ylim([0.30,1])
        plt.title('Relapse Free ' + TrajName , fontsize =15)
    plt.show()


In [None]:
# Censor follow-up at 60 (the time column is named "... (Months)"):
# every time above 60 is set to 60 and its event flag is set to 0.
# Copies are used so that df_full itself is not modified.
c1, c2 = 'Relapse Free Status (Months)', 'Relapse Free Status'
T0 = df_full[c1].copy()
E0 = df_full[c2].copy()
mask = T0 > 60
T0[mask] = 60
E0[mask] = 0

# NOTE(review): this cell looks unfinished - T and E are computed for each
# trajectory, but the inner loop over genes only executes `continue`.
for i,pstt in enumerate(PseudoTimeTraj):
    if len(all_trajectories_edges[i]) == 0: continue 
    #if i == 0: continue 
    #if i == 1: continue 

    TrajName = 'Trajectory:'+str(pstt['Trajectory'][0])+'--'+str(pstt['Trajectory'][-1])
    points = pstt['Points']
    
    # Censored time / event values restricted to the points of this trajectory.
    T = T0[pstt['Points']].copy()
    E = E0[pstt['Points']].copy()

    for ig, gen_name in enumerate( ['CDC20', 'UBE2C']): # , 'UBE2C', 'MELK' ]:
        continue 

