# What this notebook is about

Construct trajectories for the OMICS part of the METABRIC breast cancer dataset and color them by PAM50 clusters.
Trajectories are constructed with the ElPiGraph package.
We study different preprocessing steps and parameters and choose the best ones from our point of view.

The notebook shows results for many versions of parameters and preprocessing steps.
The best scheme: preprocessing from the ClinTrajan package (including the standard scaler), keep only the 1000 genes with the highest variance,
reduce to 30 dimensions with PCA, construct an ElPiGraph tree with 20 nodes, and additionally apply prune_the_tree + ExtendLeaves_modified from ClinTrajan.

The main conclusion is that the clusters defined by segments of the trajectories
correspond quite well to the standard PAM50 clusters. (In subsequent parts we will argue that these clusters might even have certain advantages
over the standard clusters.)

ElPiGraph package: https://github.com/j-bac/elpigraph-python

ClinTrajan package: https://github.com/auranic/ClinTrajan/ (downloaded to the present kaggle dataset folder)


Version 7: just introduction updated

Version 6: test additional preprocessing step - optimal_scaling from ClinTrajan package 

***Conclusion:*** The same parameters as before remain quite good.

Version 5: use standard scaler, additionally check the range(15,25) of nodes values, 
using options: prune_the_tree(tree), tree = ExtendLeaves_modified(X, tree, Mode = "QuantDists", ControlPar = .5, DoSA = False)

***Conclusion:*** 20 nodes with standard scaler + prune_the_tree + ExtendLeaves_modified provide the
most reasonable trajectories, i.e. branching points correspond to a "transition" from one PAM50 cluster to another.


Version 4: (use standard scaler:)
X = scipy.stats.zscore(X)


Version 3: (do not use standard scaler:)
comment out the line: X = scipy.stats.zscore(X)



In [None]:
#pd.set_option('display.max_rows', 500)
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)

# Import

In [None]:
# This Python environment is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

import time

from importlib import reload  
import scipy.stats

!pip install  --no-dependencies  git+https://github.com/j-bac/elpigraph-python.git
import elpigraph

# Prepare data

In [None]:
# --- OMICS table -----------------------------------------------------------
# The file is read with its first column as index and then transposed, so the
# original column headers end up as the row index of df1.
df1 = pd.read_csv(
    '/kaggle/input/breast-cancer-omics-bulk-data/METABRIC.txt',
    sep='\t',
    index_col=0,
).T
# Strip the dataset prefix from the row labels so that they can be matched
# against the 'Patient ID' index of the clinical table in the join below.
i1 = [s.replace('BRCA-METABRIC-S1-', '') for s in df1.index]
df1.index = i1

# --- Clinical table --------------------------------------------------------
df2 = pd.read_csv(
    '/kaggle/input/breast-cancer-omics-bulk-data/METABRIC_clinical.txt',
    sep='\t',
).set_index('Patient ID')

# --- Join: clinical columns first, OMICS columns after them ----------------
# Only ids present in both tables survive the inner join.
df = df2.join(df1, how='inner')
print('Joined data shape', df.shape)

# Keep only the rows that have a relapse status and encode it as 0/1.
m = df['Relapse Free Status'].notnull()
print(m.sum())
df = df[m].copy()
df['Relapse Free Status'] = df['Relapse Free Status'].map(
    {'0:Not Recurred': 0, '1:Recurred': 1}
)
print(df.shape)
display(df.head())

# df_full keeps clinical + OMICS columns; df keeps everything from column 37 on.
# NOTE(review): 37 is assumed to be the number of clinical columns contributed
# by df2 -- confirm against df2.shape[1].
df_full = df.copy()
df = df.iloc[:, 37:]  # OMICS data only

In [None]:
# Number of samples per PAM50 + Claudin-low subtype label (shown as the cell output).
df_full['Pam50 + Claudin-low subtype'].value_counts()

# Visualization

Also keep only the 1000 most variable genes.

In [None]:
# Independent NumPy copy of the OMICS frame, so later edits of X do not touch df.
X = df.values.copy()

In [None]:
# Imports are repeated here so that the cell can be run on its own.
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Label used to colour every sample.
f = 'Pam50 + Claudin-low subtype'  # 'pam50_+_claudin-low_subtype'
vec4color = df_full[f]

# `pca` is bound to the PCA class itself; a fresh default instance is created for the fit.
pca = PCA
r = pca().fit_transform(X)

# Scatter plot of the first two principal components.
plt.figure(figsize=(20, 10))
sns.scatterplot(x=r[:, 0], y=r[:, 1], hue=vec4color)
plt.title('PCA for Omics data colored by Pam50 groups')
plt.show()

In [None]:
# Variance of every column of X (one value per feature).
X_var = np.var(X, axis=0)
print(X_var.shape, X.shape, X_var[:5])

# argsort is ascending, so the last 1000 indices point at the 1000
# highest-variance columns; X is restricted to exactly those columns.
ix = X_var.argsort()
X = X[:, ix[-1000:]]


In [None]:
# Apply the same column selection (indices in ix, last 1000) to the DataFrame,
# so that df and X keep the same columns in the same order.
df = df.iloc[:,ix[-1000:]]
df

In [None]:
# Imports are repeated here so that the cell can be run on its own.
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

f = 'Pam50 + Claudin-low subtype'  # 'pam50_+_claudin-low_subtype'

# `pca` is bound to the PCA class itself; a fresh default instance is created for the fit.
pca = PCA
r = pca().fit_transform(X)

# First two principal components of the current X, coloured with the
# vec4color labels defined in an earlier cell.
plt.figure(figsize=(20, 10))
sns.scatterplot(x=r[:, 0], y=r[:, 1], hue=vec4color)
plt.title('PCA for Omics data colored by Pam50 groups')
plt.show()

In [None]:
!pip install trimap

import sys
sys.path.insert(0,'/kaggle/input/breast-cancer-omics-bulk-data/code/')# "/path/to/your/package_or_module")
print(sys.path)

from clintraj_qi import *
from clintraj_eltree import *
from clintraj_util import *
from clintraj_ml import *
from clintraj_optiscale import *

In [None]:
# Classify the columns of df with the ClinTrajan helper (star-imported from the clintraj_* modules).
# NOTE(review): the meaning of the positional argument 10 is not visible here -- presumably a
# threshold on the number of distinct values; confirm against the ClinTrajan source.
variable_types, binary, continuous, ordinal = detect_variable_type(df,10,verbose=False)

In [None]:
# ClinTrajan helper; judging by its name it drops columns with a single constant value.
df = remove_constant_columns_from_dataframe(df)
variable_names = list(df.columns)
# Rebuild X from the cleaned frame as an independent NumPy copy.
X = df.to_numpy().copy()
# Two snapshots of the matrix taken before any scaling is applied.
X_original = X.copy()
X_before_scaling = X.copy()
# NOTE(review): variable_types was computed on df BEFORE the constant columns were removed;
# if any column was actually dropped above, its length no longer matches X.shape[1] -- verify.
X,cik = optimal_scaling(X,variable_types,verbose=True,vmax=0.6)
# Snapshot of the matrix right after optimal_scaling.
X_save = X.copy()
X.shape

In [None]:
# Imports are repeated here so that the cell can be run on its own.
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

f = 'Pam50 + Claudin-low subtype'  # 'pam50_+_claudin-low_subtype'

# `pca` is bound to the PCA class itself; a fresh default instance is created for the fit.
pca = PCA
r = pca().fit_transform(X)

# First two principal components of X as returned by optimal_scaling,
# coloured with the vec4color labels defined in an earlier cell.
plt.figure(figsize=(20, 10))
sns.scatterplot(x=r[:, 0], y=r[:, 1], hue=vec4color)
plt.title('PCA for Omics data colored by Pam50 groups')
plt.show()

# Reduce dimensions

In [None]:
# Number of leading principal components kept as the working representation.
reduced_dimension = 30
# Standardize X with scipy's z-score (the "standard scaler" step toggled between notebook versions 3 and 4).
X = scipy.stats.zscore(X)
# Full PCA: as many components as X has columns, exact ('full') SVD solver.
pca = PCA(n_components=X.shape[1],svd_solver='full')
Y = pca.fit_transform(X)
# Component matrix transposed, i.e. one principal direction per column.
v = pca.components_.T
# Column means of the standardized matrix (taken after the z-score step above).
mean_val = np.mean(X,axis=0)
# From here on X is the projection onto the first `reduced_dimension` components.
X = Y[:,0:reduced_dimension]

In [None]:
# Imports are repeated here so that the cell can be run on its own.
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Label used to colour every sample.
f = 'Pam50 + Claudin-low subtype'  # 'pam50_+_claudin-low_subtype'
vec4color = df_full[f]

# NOTE: this rebinds `pca` to the PCA class, so the fitted model created in the
# "Reduce dimensions" cell is no longer reachable through this name
# (its components are still available in `v`).
pca = PCA
r = pca().fit_transform(X)

# First two principal components of the reduced matrix X.
plt.figure(figsize=(20, 10))
sns.scatterplot(x=r[:, 0], y=r[:, 1], hue=vec4color)
plt.title('PCA for Omics data colored by Pam50 groups')
plt.show()

# Auxiliary function to plot graphs and data

In [None]:
from sklearn.decomposition import PCA
# umap is optional: it is only needed when plot_graph is called with dim_reduction='umap'.
# `except Exception` keeps the best-effort behaviour for any import-time failure,
# but no longer swallows KeyboardInterrupt / SystemExit like the bare `except:` did.
try:
    import umap
except Exception:
    print('cannot import umap')

def plot_graph(edges, nodes_positions, data = None, dim_reduction = 'PCA', graph_color = 'black', graph_linewidth=2, 
               plot_data = True, data_linewidth = 1,  data_color = 'tab:red', data_transparency_alpha = 0.9,
               showNodeNumbers = False, # Shows text with internal number of each node
               umap_n_neighbors = 50, umap_min_dist = 0.99):
    """Plot a graph given by edges and node positions, optionally on top of a data scatter plot.

    Everything is drawn on the current matplotlib axes; the caller creates the
    figure and calls ``plt.show()``. When ``dim_reduction`` is 'PCA' or 'umap'
    the reducer is fitted on ``data`` (or on ``nodes_positions`` if ``data`` is
    None) and both data and nodes are drawn in the same reduced coordinates.

    Parameters
    ----------
    edges : Nx2 integer array; ``edges[k, 0]`` and ``edges[k, 1]`` are the
        indices (rows of ``nodes_positions``) of the two ends of the k-th edge.
    nodes_positions : matrix of node positions, one node per row.
    data : optional dataset for the scatter plot; must have the same number of
        columns as ``nodes_positions``.
    dim_reduction : 'PCA', 'umap', or any other value (e.g. 'plot_first2axis')
        to plot the first two coordinates without any reduction.
    graph_color, graph_linewidth : colour and line width of nodes and edges.
    plot_data : whether to scatter-plot ``data``.
    data_linewidth, data_transparency_alpha : marker edge width and alpha of
        the matplotlib data scatter plot.
    data_color : None (matplotlib default colour), a single colour name, or a
        vector with one value per data point. With PCA/umap a vector is passed
        to seaborn as ``hue``; without reduction it is passed to matplotlib as
        ``c``.
    showNodeNumbers : if True, write the index of every node next to it.
    umap_n_neighbors, umap_min_dist : forwarded to ``umap.UMAP``.

    Examples
    --------
    edges = np.array([ [0,1],[1,2],[2,0] ] )
    nodes_positions = np.random.rand(3,10) # 3 points in 10d space
    plot_graph(edges, nodes_positions)

    t = elpigraph_output
    edges = t[0]['Edges'][0]
    nodes_positions = t[0]['NodePositions']
    plot_graph(edges, nodes_positions)
    """
    use_reduction = dim_reduction in ['PCA', 'umap']  # anything else: plot the first two axes as they are

    if use_reduction:
        if dim_reduction == 'PCA':
            reducer = PCA()
        else:
            # Local import: gives a clear ImportError here if the optional package is missing.
            import umap
            reducer = umap.UMAP(n_neighbors=umap_n_neighbors, min_dist=umap_min_dist, n_components=2)

        if data is not None:
            data2 = reducer.fit_transform(data)
        else:
            data2 = None
            reducer.fit(nodes_positions)
        nodes_positions2 = reducer.transform(nodes_positions)
    else:
        data2 = data
        nodes_positions2 = nodes_positions

    if plot_data and data2 is not None:
        # Always plot the coordinates that match nodes_positions2 (the original plotted the
        # un-reduced `data` in the coloured PCA/umap case, which can disagree with the graph).
        xs, ys = data2[:, 0], data2[:, 1]
        if data_color is None:
            plt.scatter(xs, ys, linewidth=data_linewidth, alpha=data_transparency_alpha)
        elif not use_reduction:
            plt.scatter(xs, ys, cmap=plt.cm.Paired, c=data_color,
                        linewidth=data_linewidth, alpha=data_transparency_alpha)
        elif isinstance(data_color, str):
            # A single colour name (such as the default 'tab:red') is not a valid seaborn `hue`.
            plt.scatter(xs, ys, c=data_color, linewidth=data_linewidth, alpha=data_transparency_alpha)
        else:
            # Per-point labels: let seaborn assign colours and draw the legend.
            sns.scatterplot(x=xs, y=ys, hue=data_color)

        if use_reduction:
            plt.xlabel(dim_reduction + '1')
            plt.ylabel(dim_reduction + '2')

    # Nodes
    plt.scatter(nodes_positions2[:, 0], nodes_positions2[:, 1], c=graph_color, linewidth=graph_linewidth)

    # Edges: one line segment per row of `edges`
    for k in range(edges.shape[0]):
        n0 = edges[k, 0]
        n1 = edges[k, 1]
        x_line = [nodes_positions2[n0, 0], nodes_positions2[n1, 0]]
        y_line = [nodes_positions2[n0, 1], nodes_positions2[n1, 1]]
        plt.plot(x_line, y_line, graph_color, linewidth=graph_linewidth)

    if showNodeNumbers:
        for i in range(nodes_positions2.shape[0]):
            # `fontsize` is the Text property name; the capitalised `FontSize` is rejected by current Matplotlib.
            plt.text(nodes_positions2[i, 0], nodes_positions2[i, 1], str(i),
                     fontsize=20, bbox=dict(facecolor='grey', alpha=0.5))
    
# Smoke test of plot_graph: a triangle (3 nodes, 3 edges) at random positions, no data points.
edges = np.array([ [0,1],[1,2],[2,0] ] )
nodes_positions = np.random.rand(3,10) # 3 points in 10d space
# With data=None the default PCA reducer is fitted on the node positions themselves.
plot_graph(edges, nodes_positions)
plt.title('Example graph plot with  plot_graph function')
plt.show()

# Trajectories by ElpiGraph

# No pruning, No extension 

In [None]:
import time

# Per-sample labels as a plain array, used to colour the data points.
f = 'Pam50 + Claudin-low subtype'  # 'pam50_+_claudin-low_subtype'
vec4colors = df_full[f].values

# Fit one elastic principal tree per node count and plot it over the data
# (no pruning, no leaf extension).
for nnodes in (10, 20, 25, 30, 40, 50, 60, 100):
    t0 = time.time()

    tree_elpi = elpigraph.computeElasticPrincipalTree(
        X, nnodes,
        alpha=0.01, Mu=0.1, Lambda=0.05,
        FinalEnergy='Penalized',
    )
    tree = tree_elpi[0]

    nodes_positions = tree['NodePositions']
    edges = tree['Edges'][0]

    plt.figure(figsize=(20, 8))
    plot_graph(edges, nodes_positions, data=X, data_color=vec4colors)
    plt.title(f'nnodes {nnodes}')
    plt.show()

    # Wall-clock time of fit + plot for this node count.
    print(np.round(time.time() - t0, 1), 'seconds passed. nnodes = ', nnodes)

In [None]:
!pip install trimap

import sys
sys.path.insert(0,'/kaggle/input/breast-cancer-omics-bulk-data/code/')# "/path/to/your/package_or_module")
print(sys.path)

from clintraj_qi import *
from clintraj_eltree import *
from clintraj_util import *
from clintraj_ml import *
from clintraj_optiscale import *

## Prune the tree (i.e. cut edges of length 1)

In [None]:
import time

# Per-sample labels as a plain array, used to colour the data points.
f = 'Pam50 + Claudin-low subtype'  # 'pam50_+_claudin-low_subtype'
vec4colors = df_full[f].values

# Same scan over node counts as before, with the ClinTrajan pruning step added.
for nnodes in (10, 20, 25, 30, 40, 50, 60, 100):
    t0 = time.time()

    tree_elpi = elpigraph.computeElasticPrincipalTree(
        X, nnodes,
        alpha=0.01, Mu=0.1, Lambda=0.05,
        FinalEnergy='Penalized',
    )
    tree = tree_elpi[0]

    # The return value is not used: the pruned tree is read back from `tree` itself.
    prune_the_tree(tree)

    nodes_positions = tree['NodePositions']
    edges = tree['Edges'][0]

    plt.figure(figsize=(20, 8))
    plot_graph(edges, nodes_positions, data=X, data_color=vec4colors)
    plt.title(f'nnodes {nnodes}')
    plt.show()

    # Wall-clock time of fit + prune + plot for this node count.
    print(np.round(time.time() - t0, 1), 'seconds passed. nnodes = ', nnodes)
    
    


## Prune and then extend to reach extreme data points 

In [None]:
import time

# Per-sample labels as a plain array, used to colour the data points.
f = 'Pam50 + Claudin-low subtype'  # 'pam50_+_claudin-low_subtype'
vec4colors = df_full[f].values

# Scan over node counts with both post-processing steps: prune, then extend the leaves.
for nnodes in (10, 20, 25, 30, 40, 50, 60, 100):
    t0 = time.time()

    tree_elpi = elpigraph.computeElasticPrincipalTree(
        X, nnodes,
        alpha=0.01, Mu=0.1, Lambda=0.05,
        FinalEnergy='Penalized',
    )
    tree = tree_elpi[0]

    # The return value is not used: the pruned tree is read back from `tree` itself.
    prune_the_tree(tree)

    # Extend the leaves so that the tree reaches the extreme data points;
    # unlike the pruning step, the result is returned and rebound to `tree`.
    tree = ExtendLeaves_modified(X, tree, Mode="QuantDists", ControlPar=.5, DoSA=False)

    nodes_positions = tree['NodePositions']
    edges = tree['Edges'][0]

    plt.figure(figsize=(20, 8))
    plot_graph(edges, nodes_positions, data=X, data_color=vec4colors)
    plt.title(f'nnodes {nnodes}')
    plt.show()

    # Wall-clock time of fit + post-processing + plot for this node count.
    print(np.round(time.time() - t0, 1), 'seconds passed. nnodes = ', nnodes)
    
    


## Prune and then extend to reach extreme data points (finer scan: 15 to 24 nodes)

In [None]:
import time

# Per-sample labels as a plain array, used to colour the data points.
f = 'Pam50 + Claudin-low subtype'  # 'pam50_+_claudin-low_subtype'
vec4colors = df_full[f].values

# Finer scan: every node count from 15 to 24, with pruning and leaf extension.
for nnodes in range(15, 25):
    t0 = time.time()

    tree_elpi = elpigraph.computeElasticPrincipalTree(
        X, nnodes,
        alpha=0.01, Mu=0.1, Lambda=0.05,
        FinalEnergy='Penalized',
    )
    tree = tree_elpi[0]

    # The return value is not used: the pruned tree is read back from `tree` itself.
    prune_the_tree(tree)

    # Extend the leaves so that the tree reaches the extreme data points;
    # unlike the pruning step, the result is returned and rebound to `tree`.
    tree = ExtendLeaves_modified(X, tree, Mode="QuantDists", ControlPar=.5, DoSA=False)

    nodes_positions = tree['NodePositions']
    edges = tree['Edges'][0]

    plt.figure(figsize=(20, 8))
    plot_graph(edges, nodes_positions, data=X, data_color=vec4colors)
    plt.title(f'nnodes {nnodes}')
    plt.show()

    # Wall-clock time of fit + post-processing + plot for this node count.
    print(np.round(time.time() - t0, 1), 'seconds passed. nnodes = ', nnodes)
    
    
