# Summary: This notebook preprocesses the OAK and POPLAR data and prepares the datasets for predictive models. It has four major steps:
## 1) Load knowledge graph (subgraph);
## 2) Load patient features and overall survival;
## 3) Find common gene panel between OAK and POPLAR;
## 4) Patient graph embedding generation and save the dataset

In [None]:
import sys
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
from karateclub import SocioDim
from karateclub import EgoNetSplitter
import networkx as nx


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sksurv.util import Surv
from sksurv.datasets import load_gbsg2
from sksurv.preprocessing import OneHotEncoder
from pysurvival.models.survival_forest import RandomSurvivalForestModel
from lifelines.utils import concordance_index as lfcindex
from sklearn.tree import DecisionTreeRegressor


In [None]:
import random
# Single seed for the whole notebook; it is also used to name the per-seed
# output folder when results are saved.
randomSeed=10
# Seed both NumPy's and the standard library's global generators so that code
# relying on either of them is reproducible.
np.random.seed(randomSeed)
random.seed(randomSeed)


In [None]:
def display_summary(df, name:str=None, n_samples:int=5):
    """Print the shape of a DataFrame and show a random sample of its rows.

    Args:
        df: DataFrame to summarise.
        name: Optional label printed in the header line.
        n_samples: Maximum number of rows to sample. It is capped at the
            number of rows in ``df`` so that small frames do not raise.
    """
    if name:
        print(f"Summary of data for: {name}")
    n_rows, n_cols = df.shape
    print(f"Number of rows: {n_rows}")
    print(f"Number of columns: {n_cols}")
    print("\nSample of data:")
    # `display` is injected by IPython/Jupyter; fall back to `print` so the
    # helper also works when the code runs as a plain script.
    try:
        show = display
    except NameError:
        show = print
    show(df.sample(min(n_samples, n_rows)))

# 1. Load bikg subgraph

### Load the knowledge graph subgraph and reduce it to an edge list in the format "source_label (e.g. gene) - target_label (e.g. gene, pathway)"

In [None]:
# Read the knowledge-graph subgraph and keep just the two edge-endpoint columns.
subgraph_edges = pd.read_csv(
    '../Data/inputs/BIKGImmuneGeneGeneSubgraphs/subgraph4_1.csv',
    sep=',',
).loc[:, ['source_label', 'target_label']]

# Cell output: show the edge list.
subgraph_edges

# 2. Load patient survival information

### This block loads the patient survival information, which should be in the format "SAMPLE_ID | OS | OS.CNSR". In OS.CNSR: 0 = censored; 1 = deceased

In [None]:

# Load the OAK cohort and label every row with a synthetic, position-based
# patient id ("Patient0", "Patient1", ...), which becomes the index.
OAKDatasetForTraining = pd.read_csv('../Data/inputs/inputDatasetOAK/OAK-IO.csv', sep=',')
OAKDatasetForTraining['SAMPLE_ID'] = [
    f'Patient{row_number}' for row_number in range(len(OAKDatasetForTraining))
]
OAKDatasetForTraining = OAKDatasetForTraining.set_index('SAMPLE_ID')

# The two survival columns, indexed by patient id.
survival_outcomes_OAK = OAKDatasetForTraining.loc[:, ['OS', 'OS.CNSR']]

survival_outcomes_OAK

### The following block is to load patient genomic mutation features

In [None]:
# Mutation columns are identified by this prefix; what follows it is used as
# the gene name. Selecting with startswith() keeps the selection consistent
# with the prefix stripping below.
molecularPrefix='molecular_'
genomicFeaturesColumn=[col for col in OAKDatasetForTraining.columns if col.startswith(molecularPrefix)]
# Work on an explicit copy so the edits below never touch OAKDatasetForTraining.
genomic_features_OAK=OAKDatasetForTraining.loc[:,genomicFeaturesColumn].copy()
# Missing values are treated as 0.
genomic_features_OAK=genomic_features_OAK.fillna(0)
# Binarise: every positive value becomes 1.
genomic_features_OAK[genomic_features_OAK > 0] = 1
genomic_features_OAK.columns=[name[len(molecularPrefix):] for name in genomic_features_OAK.columns]
#genomic_features_OAK

### Patient cohort statistics. For this OAK dataset, there are 324 patients. The gene panel contains 396 genes

In [None]:
# Print the shape of the binarised OAK mutation matrix and show a random sample of rows.
display_summary(genomic_features_OAK, "patient genomic features (OAK)")


In [None]:
# Load the POPLAR cohort, indexed by its PtID column.
PoplarDatasetForHoldout = pd.read_csv(
    '../Data/inputs/inputDatasetPoplar/poplar_mutated-genes.csv', sep=','
).set_index('PtID')
# Replace OS.CNSR by 1 - OS.CNSR (presumably to match the coding used for
# OAK, 0 = censored / 1 = deceased -- confirm against the data dictionary).
PoplarDatasetForHoldout['OS.CNSR'] = 1 - PoplarDatasetForHoldout['OS.CNSR']
# Keep only the rows whose TRT01P value is MPDL3280A.
IOtreated = PoplarDatasetForHoldout[PoplarDatasetForHoldout['TRT01P'] == 'MPDL3280A']

survival_outcomes_Poplar = IOtreated.loc[:, ['OS', 'OS.CNSR']]
survival_outcomes_Poplar

In [None]:
# Columns from position 24 onwards are taken as the per-gene values
# (NOTE(review): depends on the column layout of the input CSV -- confirm).
# .copy() detaches the slice from IOtreated, so the in-place binarisation
# below neither raises SettingWithCopyWarning nor writes through to IOtreated.
genomic_features_Poplar=IOtreated.iloc[:,24:].copy()
# Binarise: every positive value becomes 1.
genomic_features_Poplar[genomic_features_Poplar>0]=1
#genomic_features_Poplar

### Patient cohort statistics. For this POPLAR dataset, there are 102 patients. The gene panel contains 363 genes

In [None]:
# Print the shape of the binarised POPLAR mutation matrix and show a random sample of rows.
display_summary(genomic_features_Poplar, "patient genomic features (POPLAR)")


# 3. Find the common gene panel between the two datasets

In [None]:
# Gene panel of each cohort, i.e. the column labels of its mutation matrix.
OAK_panel = genomic_features_OAK.columns.tolist()
Poplar_panel = genomic_features_Poplar.columns.tolist()

In [None]:
# Number of genes in the OAK panel.
len(OAK_panel)

In [None]:
# Number of genes in the POPLAR panel.
len(Poplar_panel)

In [None]:
def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    The order of ``lst1`` is preserved, and an item that appears several
    times in ``lst1`` appears as many times in the result.
    """
    try:
        # Hash-based lookup: constant-time membership tests instead of
        # scanning lst2 once per item of lst1.
        lookup = set(lst2)
        return [value for value in lst1 if value in lookup]
    except TypeError:
        # Some item is unhashable; fall back to the linear scan.
        return [value for value in lst1 if value in lst2]

In [None]:
# Genes present in both panels, kept in the order of the OAK panel.
commonPanel=intersection(OAK_panel, Poplar_panel)
# Cell output: size of the shared panel.
len(commonPanel)

# 4. Learn patient embedding using bikg_edges and gene_patient_edges

In [None]:
# Cell output: show the knowledge-graph edge list again for reference.
subgraph_edges

### This function converts the patient genomic feature matrix into a graph edge list in the format "source_label (e.g. patient id) - target_label (e.g. gene mutation)"


In [None]:
def getGenePatientEdges(genomic_features):
    """Turn an indicator matrix into an edge list.

    Every cell of ``genomic_features`` equal to 1 yields one row in the
    result: ``source_label`` is the row label of that cell and
    ``target_label`` is its column label. Edges are listed row by row.
    """
    hit_rows, hit_cols = np.nonzero((genomic_features == 1).to_numpy())
    patient_ids = genomic_features.index[hit_rows]
    gene_names = genomic_features.columns[hit_cols]
    edge_pairs = list(zip(patient_ids, gene_names))
    return pd.DataFrame(edge_pairs, columns=['source_label', 'target_label'])

In [None]:
# Patient -> gene edges for each cohort, restricted to the shared gene panel.
gene_patient_edges_holdout = getGenePatientEdges(genomic_features_Poplar[commonPanel])
gene_patient_edges_dataframe = getGenePatientEdges(genomic_features_OAK[commonPanel])

### The following function fuses the "patient-gene" graph with the knowledge graph subgraph containing "gene-gene" or "gene-pathway" edges. A graph embedding algorithm (e.g. SocioDim) is then applied to learn an embedding for each node of the fused graph

In [None]:

def learnBIKGGraphEmbeddings(subgraph_edges,gene_patient_edges,dim=16,seed=23):
    """Embed the nodes of the knowledge subgraph fused with the patient-gene edges.

    Args:
        subgraph_edges: DataFrame with ``source_label`` and ``target_label``
            columns.
        gene_patient_edges: DataFrame with the same two columns.
        dim: Number of embedding dimensions passed to SocioDim.
        seed: Random seed passed to SocioDim.

    Returns:
        A pair ``(embedding, node_to_num)``: ``embedding`` is the result of
        ``SocioDim.get_embedding()`` and ``node_to_num`` maps every node label
        to the integer id under which the node was given to the model.
    """
    bikg_edges = pd.concat([subgraph_edges, gene_patient_edges], ignore_index=True)
    edge_labels = bikg_edges[["source_label", "target_label"]]
    # Number the nodes in order of first appearance in the edge list.
    # Iterating over a set of strings would give an order that can change from
    # one interpreter run to the next (hash randomisation), which would make
    # the node numbering irreproducible even with a fixed model seed.
    nodes = pd.unique(edge_labels.to_numpy().ravel())
    node_to_num = {node: num for num, node in enumerate(nodes)}
    numbered_edges = pd.DataFrame(
        {
            "source_num_id": edge_labels["source_label"].map(node_to_num),
            "target_num_id": edge_labels["target_label"].map(node_to_num),
        }
    ).values.tolist()
    upstream_model = SocioDim(dimensions=dim,seed=seed)
    upstream_model.fit(nx.from_edgelist(numbered_edges))
    BIKG_graph_embedding = upstream_model.get_embedding()
    return BIKG_graph_embedding, node_to_num




In [None]:
# Fit the node embeddings on the knowledge subgraph fused with the OAK
# patient-gene edges.
BIKG_graph_embedding, node_to_num = learnBIKGGraphEmbeddings(
    subgraph_edges, gene_patient_edges_dataframe
)


### Each patient representation is generated by averaging the embeddings of the genes in that patient's genomic mutation profile.

In [None]:
def buildPatientEmbeddingUsingGeneEmbedding(gene_patient_edges,BIKG_graph_embedding,node_to_num):
    """Average gene embeddings into one embedding row per patient.

    Edges are grouped by ``source_label``. For every group, the embedding rows
    of its ``target_label`` entries are summed and the sum is divided by the
    number of entries in the group. Labels that are missing from
    ``node_to_num`` add nothing to the sum but still count in the divisor.

    Returns a DataFrame indexed by ``source_label`` with columns ``X0``,
    ``X1``, ... (one per embedding dimension).
    """
    dim = BIKG_graph_embedding.shape[1]
    genes_by_patient = gene_patient_edges.groupby('source_label')['target_label'].apply(list)

    averaged_rows = []
    for gene_list in genes_by_patient:
        running_sum = np.zeros(dim, dtype=int)
        for gene in gene_list:
            if gene not in node_to_num:
                # Unknown label: equivalent to adding a zero vector.
                continue
            running_sum = running_sum + np.array(BIKG_graph_embedding[node_to_num[gene], :])
        averaged_rows.append(running_sum / len(gene_list))

    column_names = [f'X{i}' for i in range(dim)]
    return pd.DataFrame(averaged_rows, index=genes_by_patient.index, columns=column_names)

In [None]:
# Both cohorts are embedded with the same node embeddings, which were fitted on
# the knowledge subgraph plus the OAK patient-gene edges only.
patient_embedding_holdout=buildPatientEmbeddingUsingGeneEmbedding(gene_patient_edges_holdout,BIKG_graph_embedding,node_to_num)
patient_embedding_dataframe=buildPatientEmbeddingUsingGeneEmbedding(gene_patient_edges_dataframe,BIKG_graph_embedding,node_to_num)



### save patient embedding results


In [None]:
import os

output_dir='../Data/outputs/RobustTestUsingPoplar/'
# Results go into one sub-folder per random seed. Create it up front because
# to_csv does not create missing directories.
os.makedirs(output_dir+str(randomSeed), exist_ok=True)
patient_embedding_dataframe.to_csv(output_dir+str(randomSeed)+'/patient_embedding_dataframe.csv')
patient_embedding_holdout.to_csv(output_dir+str(randomSeed)+'/patient_embedding_holdout.csv')

In [None]:
def getSurvivalInformation(survival_outcomes,patient_embedding):
    """Build the survival target for the patients that have an embedding.

    ``survival_outcomes`` is right-joined with ``patient_embedding`` on the
    index, so the rows follow the index of ``patient_embedding``. The joined
    frame is handed to ``Surv.from_dataframe`` with ``OS.CNSR`` as the event
    column and ``OS`` as the time column, and that result is returned.
    """
    outcomes_for_embedded = survival_outcomes.join(patient_embedding, how='right')
    return Surv.from_dataframe(event="OS.CNSR", time="OS", data=outcomes_for_embedded)

In [None]:
# Survival targets for the OAK and POPLAR patients that received an embedding.
y_dataframe=getSurvivalInformation(survival_outcomes_OAK,patient_embedding_dataframe)
y_holdout=getSurvivalInformation(survival_outcomes_Poplar,patient_embedding_holdout)

In [None]:
# Persist the survival targets of both cohorts in the per-seed output folder.
np.save(f'{output_dir}{randomSeed}/y_dataframe.npy', y_dataframe)
np.save(f'{output_dir}{randomSeed}/y_holdout.npy', y_holdout)

In [None]:

# Collect the btmb value of every holdout patient that has an embedding, in
# the row order of the joined frame, and save the values.
TMB = IOtreated.loc[:, ['btmb']]
filterTMB = TMB.join(patient_embedding_holdout, how='right')
tmbValue = filterTMB['btmb'].tolist()
np.save(f'{output_dir}{randomSeed}/tmbValue.npy', tmbValue)

In [None]:
# Save the binarised mutation matrices of both cohorts in the per-seed output folder.
genomic_features_OAK.to_csv(f'{output_dir}{randomSeed}/genomic_features_OAK.csv')
genomic_features_Poplar.to_csv(f'{output_dir}{randomSeed}/genomic_features_Poplar.csv')
