In [1]:
import numpy as np
import pandas as pd
import random
import networkx as nx
import collections
from scipy.stats import wilcoxon, pearsonr
from statsmodels.stats.multitest import multipletests

# Seed the stdlib `random` generator (used further down for the random
# control-pathway draws) and NumPy's global generator with the same fixed value.
random.seed(1999)
np.random.seed(1999)

In [2]:
# CRISPR gene-effect table; the first CSV column is used as the row index.
data = pd.read_csv("../../dataset/CRISPRGeneEffect.csv", index_col=0)

# Somatic mutation calls, restricted to SNP rows and to the four columns used below.
mutation = pd.read_csv("../../dataset/OmicsSomaticMutations.csv")
mutation = mutation.loc[mutation['VariantType'] == 'SNP',
                        ['Chrom', 'Pos', 'HugoSymbol', 'ModelID']]

# ClinVar variant summary, restricted to GRCh38 single-nucleotide variants.
clinvar = pd.read_csv("../../dataset/ClinVar/ClinVar_variant_summary.txt", delimiter='\t')
clinvar = clinvar[(clinvar['Assembly'] == "GRCh38")
                  & (clinvar['Type'] == "single nucleotide variant")]

# Clinical-significance categories flagged 'Y' in the pathogenicity lookup table.
pathogenicity = pd.read_csv("../../dataset/ClinVar/pathogenicity.csv", index_col=0)
pathogenetic_type = list(pathogenicity.loc[pathogenicity['Pathogenicity'] == 'Y', 'Category'])

# ClinVar rows whose significance is one of those categories.
mutation_patho = clinvar.loc[clinvar['ClinicalSignificance'].isin(pathogenetic_type),
                             ["Chromosome", "Start", "GeneSymbol"]]
mutation_patho['Chromosome'] = mutation_patho['Chromosome'].apply(lambda chrom: 'chr' + str(chrom))

# Build the same "<chromosome>-<position>" key on both tables. The key holds
# position only, so the join below does not compare alleles or gene symbols.
mutation['ID'] = mutation['Chrom'] + '-' + mutation['Pos'].astype(str)
mutation_patho['ID'] = mutation_patho['Chromosome'] + '-' + mutation_patho['Start'].astype(str)
mutation = mutation.sort_values(by=['ID'])
mutation_patho = mutation_patho.sort_values(by=['ID'])

# Keep mutations whose key also occurs in the ClinVar subset, then reduce to
# unique (ModelID, HugoSymbol) pairs ordered by gene and model.
mutation = (
    mutation.merge(mutation_patho, on='ID', how='inner')[['ModelID', 'HugoSymbol']]
    .sort_values(["HugoSymbol", "ModelID"])
    .drop_duplicates()
)
mutation.index = range(mutation.shape[0])

  mutation = pd.read_csv("../../dataset/OmicsSomaticMutations.csv")
  clinvar = pd.read_csv("../../dataset/ClinVar/ClinVar_variant_summary.txt", delimiter='\t')


In [3]:
# Keep only the text before the first space of every column header.
gene_col = [column.split(' ')[0] for column in data.columns]
data.columns = gene_col

# Identifier mapping table; only the first row for each 'From' value is kept.
gene_info = pd.read_csv("../../dataset/InputGene/ScreenedGene.csv").drop_duplicates(
    subset=['From'], keep='first'
)

# Select the columns named in the first mapping column, then relabel them
# with the values of the second mapping column.
data = data[gene_info.iloc[:, 0]]
data.columns = gene_info.iloc[:, 1]

# Drop every column that contains at least one missing value.
ifnull = data.isnull().sum()
complete_columns = ifnull[ifnull == 0].index
data = data[complete_columns]

In [4]:
# Genes of interest, one per row of temp.csv (the file has no header row).
mut_gene = pd.read_csv("../../dataset/InputGene/temp.csv", header=None)

# Indicator matrix: one row per listed gene, one column per row label of `data`.
# A cell is set to 1 when `mutation` holds a row pairing that gene with that model.
label = pd.DataFrame(data=0, index=mut_gene.iloc[:, 0], columns=data.index)
for i in range(label.shape[0]):
    mut_sub = mutation[mutation['HugoSymbol'] == mut_gene.iloc[i, 0]]
    model_sub = list(set(mut_sub['ModelID']) & set(label.columns))
    # Write through a single .iloc call. The previous `label.iloc[i][model_sub] = 1`
    # was chained assignment: it assigns into a temporary row object, which under
    # pandas Copy-on-Write never reaches `label`, leaving the matrix all zeros.
    label.iloc[i, label.columns.isin(model_sub)] = 1

In [5]:
def get_gene_pathways(input_file, species='human'):
    """Read a tab-separated identifier/pathway file into a (group, gene) table.

    Rows are kept when the first column sorts lexicographically inside one of
    the ``ENSG000...``, ``ENSP000...`` or ``ENST000...`` ranges. The kept
    identifier is rewritten as ``'ENSG000'`` followed by its characters 7-17,
    so protein and transcript style identifiers are relabelled with the gene
    prefix.

    Parameters
    ----------
    input_file : str or path-like
        Header-less, tab-separated file. Column 0 holds the identifier and
        column 1 the pathway; any further columns are ignored.
    species : str, optional
        Accepted for call compatibility; this function does not read it.

    Returns
    -------
    pandas.DataFrame
        Columns ``['group', 'gene']`` (pathway, rewritten identifier), with
        duplicate rows removed and a fresh 0..n-1 index.
    """
    dt = pd.read_table(input_file, header=None)

    identifiers = dt[0]
    in_ensembl_range = (
        identifiers.between('ENSG00000000000000', 'ENSG00099999999999')
        | identifiers.between('ENSP00000000000000', 'ENSP00099999999999')
        | identifiers.between('ENST00000000000000', 'ENST00099999999999')
    )
    selected = dt.loc[in_ensembl_range]

    # Vectorised prefix rewrite; replaces a per-row Python loop over .iloc.
    ensembl = pd.DataFrame({
        'group': selected.iloc[:, 1],
        'gene': 'ENSG000' + selected.iloc[:, 0].str[7:18],
    })

    return ensembl.drop_duplicates().reset_index(drop=True)

In [6]:
# (group, gene) table parsed from the Ensembl-to-Reactome mapping file.
pathway_genes = get_gene_pathways("../../dataset/reactome/Ensembl2Reactome_All_Levels.txt", species='human')
# File paths passed to ReactomeNetwork in a later cell: pathway names and pathway relations.
pathway_names = '../../dataset/reactome/ReactomePathways.txt'
relations_file_name = '../../dataset/reactome/ReactomePathwaysRelation.txt'
# NOTE(review): root_name is not referenced anywhere else in the visible notebook — confirm it is still needed.
root_name = [0, 1]

In [7]:
class Reactome():
    """Holds the Reactome tables used to build the pathway network.

    Attributes set by the constructor:
      pathway_names  -- table returned by ``load_names``
      pathway_genes  -- the object passed in, stored unchanged
      hierarchy      -- table returned by ``load_hierarchy``
      species        -- the value passed in, stored unchanged
    """

    def __init__(self, pathway_names, pathway_genes, relations_file_name, species):
        # Plain pass-through attributes first, then the two file-backed tables
        # (names before relations).
        self.species = species
        self.pathway_genes = pathway_genes
        self.pathway_names = self.load_names(pathway_names)
        self.hierarchy = self.load_hierarchy(relations_file_name)

    def load_names(self, pathway_names):
        """Read the tab-separated pathway-name file into a three-column table.

        The columns are relabelled ``reactome_id``, ``pathway_name``, ``species``.

        NOTE(review): the file is read with pandas' default header handling, so
        its first line is consumed as a header and then overwritten by the
        relabelling. If the file has no header row, its first record is lost —
        confirm against the actual file.
        """
        names_table = pd.read_table(pathway_names)
        names_table.columns = ['reactome_id', 'pathway_name', 'species']
        return names_table

    def load_hierarchy(self, relations_file_name):
        """Read the header-less, tab-separated relation file.

        The two columns are labelled ``child`` and ``parent``, in file order.
        """
        relations = pd.read_csv(relations_file_name, sep='\t', header=None)
        relations.columns = ['child', 'parent']
        return relations


class ReactomeNetwork():
    """Directed pathway graph for one species, with an artificial 'root' node.

    The constructor loads the Reactome tables and stores the graph in
    ``self.netx``.
    """

    # Species name -> substring looked for in the 'child' identifiers.
    SPECIES_ABBREVIATIONS = {'mouse': 'MMU', 'human': 'HSA', 'rat': 'RNO'}

    def __init__(self, pathway_names, pathway_genes, relations_file_name, species):
        self.reactome = Reactome(pathway_names, pathway_genes, relations_file_name,
                                 species)
        self.netx = self.get_reactome_networkx()

    def get_reactome_networkx(self):
        """Return the species-specific pathway graph, building it if needed.

        Returns
        -------
        networkx.DiGraph
            One edge per relation row whose 'child' value contains the species
            abbreviation, directed from the 'child' column to the 'parent'
            column. A 'root' node is added with an edge to every node that had
            no incoming edge.

        Raises
        ------
        ValueError
            If the species is not one of 'mouse', 'human' or 'rat'. Previously
            this case left the abbreviation unbound and failed with an
            UnboundLocalError.
        """
        # An already-built graph is returned as-is.
        if hasattr(self, 'netx'):
            return self.netx

        species = self.reactome.species
        abbr = self.SPECIES_ABBREVIATIONS.get(species)
        if abbr is None:
            supported = ', '.join(sorted(self.SPECIES_ABBREVIATIONS))
            raise ValueError(
                f"Unsupported species {species!r}; expected one of: {supported}"
            )

        hierarchy = self.reactome.hierarchy
        species_hierarchy = hierarchy[hierarchy['child'].str.contains(abbr)]
        net = nx.from_pandas_edgelist(species_hierarchy, 'child', 'parent', create_using=nx.DiGraph())
        net.name = 'reactome'

        # Add a single root node above every node that has no incoming edge.
        roots = [n for n, d in net.in_degree() if d == 0]
        root_node = 'root'
        edges = [(root_node, n) for n in roots]
        net.add_edges_from(edges)

        return net

In [8]:
# Build the human pathway graph from the name, gene and relation inputs.
reactome_net = ReactomeNetwork(pathway_names, pathway_genes, relations_file_name, "human")
# Graph stored on the network object by its constructor.
original_network = reactome_net.netx
# Gene table as stored on the Reactome object (the pathway_genes passed in above).
genes_df = reactome_net.reactome.pathway_genes

In [9]:
# Node count and edge count of original_network.
num_nodes, num_edges = original_network.number_of_nodes(), original_network.number_of_edges()

# Print both counts, one per line.
for line in (f"节点数量: {num_nodes}", f"边的数量: {num_edges}"):
    print(line)

节点数量: 2581
边的数量: 2627


In [10]:
# Collect all leaf nodes.
def find_leaves(graph):
    """Return, in node order, the nodes of ``graph`` that have no outgoing edge."""
    leaves = []
    for candidate in graph.nodes():
        if graph.out_degree(candidate) == 0:
            leaves.append(candidate)
    return leaves

# Map every node to the set of leaf nodes reachable from it.
def find_all_leaf_nodes(graph):
    """Return ``{node: set of leaves reachable from node}`` for every node.

    A leaf is a node without successors; each leaf maps to a set containing
    only itself. The result is the same mapping as enumerating every
    node-to-leaf path, but each node is expanded once and its set is reused.
    The earlier path enumeration revisited shared descendants once per path,
    which grows exponentially on graphs with many converging branches.

    Parameters
    ----------
    graph : directed graph exposing ``nodes()`` and ``successors(node)``
        Expected to be acyclic; a cycle leads to unbounded recursion, as it
        did with the path enumeration.

    Returns
    -------
    dict
        One entry per node; every value is its own ``set`` object.
    """
    leaf_map = {}

    def reachable_leaves(node):
        # Reuse the set computed when this node was first visited.
        if node in leaf_map:
            return leaf_map[node]
        children = list(graph.successors(node))
        if children:
            found = set()
            for child in children:
                found |= reachable_leaves(child)
        else:
            # A leaf reaches only itself.
            found = {node}
        leaf_map[node] = found
        return found

    for node in graph.nodes():
        reachable_leaves(node)

    return leaf_map
    
def add_genes_to_network(network, genes_df, leaf_map):
    """Attach gene nodes to ``network`` underneath pathway leaf nodes.

    For each row of ``genes_df`` (columns ``group`` = pathway, ``gene``):
    the gene is added as a node with ``type='gene'`` unless the network
    already has it, and an edge is added from every leaf listed for that
    pathway in ``leaf_map`` to the gene. Rows whose pathway is not a key of
    ``leaf_map`` add no edges.

    ``network`` is modified in place and the same object is returned.
    """
    for pathway_id, gene_id in zip(genes_df['group'], genes_df['gene']):
        # Register the gene once, tagged so it can be told apart from pathways.
        if not network.has_node(gene_id):
            network.add_node(gene_id, type='gene')

        # Connect each leaf recorded for this pathway to the gene.
        for leaf in leaf_map.get(pathway_id, ()):
            network.add_edge(leaf, gene_id)

    return network

In [11]:
# Leaf sets are computed on the graph before any gene node is added.
leaf_map = find_all_leaf_nodes(original_network)
# add_genes_to_network adds to the graph it is given and returns that same object,
# so final_network and original_network refer to one graph from here on.
final_network = add_genes_to_network(original_network, genes_df, leaf_map)

In [12]:
# Node count and edge count of final_network.
num_nodes, num_edges = final_network.number_of_nodes(), final_network.number_of_edges()

# Print both counts, one per line.
for line in (f"节点数量: {num_nodes}", f"边的数量: {num_edges}"):
    print(line)


节点数量: 65195
边的数量: 18166335


In [41]:
# def add_duplicated_edges(G, node, n_levels):
#     edges = []
#     source = node
#     for l in range(n_levels):
#         target = node + '_copy' + str(l + 1)
#         edge = (source, target)
#         source = target
#         edges.append(edge)

#     G.add_edges_from(edges)
#     return G, target


# def add_gene_edges(G, node, pathways, genes_df):
#     genes = []
#     if type(pathways) == str:
#         genes = genes + list(genes_df[genes_df['group'] == pathways]['gene'])
#     else:
#         for i in range(len(pathways)):
#             genes = genes + list(genes_df[genes_df['group'] == pathways[i]]['gene'])
#     genes = list(set(genes))
#     edges = []
#     source = node
#     for target in genes:
#         edge = (source, target)
#         edges.append(edge)
#     G.add_edges_from(edges)
#     return G


# def get_nodes_at_level(net, distance):
#     nodes = set(nx.ego_graph(net, 'root', radius=distance))  # get all nodes within distance around the query node
#     if distance >= 1.:
#         nodes -= set(nx.ego_graph(net, 'root',
#                                   radius=distance - 1))  # remove nodes that are not at the specified distance but closer
#     return list(nodes)


# def gene_mapping(gene, df):
#     inter_gene = list(set(gene) & set(df['gene']))
#     genedict = {}
#     genelist = [df.iloc[0, 0]]
#     genedict[df.iloc[0, 1]] = genelist
#     for i in range(1, df.shape[0]):
#         if df.iloc[i, 1] == df.iloc[i - 1, 1]:
#             genelist.append(df.iloc[i, 0])
#         else:
#             genedict[df.iloc[i - 1, 1]] = genelist
#             genelist = [df.iloc[i, 0]]
#     mappingdf = pd.DataFrame(data=None, columns=['group', 'gene'])
#     for j in range(len(inter_gene)):
#         mappingdf_iter = {'group': genedict[inter_gene[j]],
#                           'gene': [inter_gene[j]] * len(genedict[inter_gene[j]])}
#         mappingdf_iter = pd.DataFrame(mappingdf_iter)
#         mappingdf = pd.concat([mappingdf, mappingdf_iter])
#     return mappingdf


In [42]:
# data_gene_in = data.columns.tolist()
# species = 'human'
# n_hidden = 3

# reactome_net = ReactomeNetwork(pathway_names, pathway_genes, relations_file_name, species)
# genes_df = reactome_net.reactome.pathway_genes
# genes_df = gene_mapping(data_gene_in, genes_df)

# original_network = reactome_net.netx
# original_terminal_nodes = [n for n, d in original_network.out_degree() if d == 0]
# in_genes_df = [False for x in range(0, len(original_terminal_nodes))]
# while in_genes_df.count(False) > 0:
#     in_genes_df = [False for x in range(0, len(original_terminal_nodes))]
#     for i in range(len(original_terminal_nodes)):
#         if original_terminal_nodes[i] in genes_df['group'].to_list():
#             in_genes_df[i] = True
#     for i in range(len(original_terminal_nodes)):
#         if in_genes_df[i] == False:
#             original_network.remove_node(original_terminal_nodes[i])
#     original_terminal_nodes = [n for n, d in original_network.out_degree() if d == 0]

# sub_graph = nx.ego_graph(original_network, 'root',
#                          radius=n_hidden)  # subgraph of neighbors centered at node "root" <= a given radius (n_level).
# sub_terminal_nodes = [n for n, d in sub_graph.out_degree() if d == 0]
# for node in sub_terminal_nodes:
#     distance = len(nx.shortest_path(sub_graph, source='root',
#                                     target=node))  # len of distance: num of nodes in the shortest path
#     if (distance == n_hidden + 1) & (node not in original_terminal_nodes):
#         part_graph = nx.ego_graph(original_network, node, radius=100)
#         corresponding_terminal_nodes = [n for n, d in part_graph.out_degree() if d == 0]
#         sub_graph = add_gene_edges(sub_graph, node, corresponding_terminal_nodes, genes_df)
#     elif (distance == n_hidden + 1) & (node in original_terminal_nodes):
#         corresponding_terminal_nodes = node
#         sub_graph = add_gene_edges(sub_graph, node, corresponding_terminal_nodes, genes_df)
#     elif distance <= n_hidden:
#         diff = n_hidden - distance + 1
#         sub_graph, copy_node = add_duplicated_edges(sub_graph, node, diff)
#         sub_graph = add_gene_edges(sub_graph, copy_node, node, genes_df)
# final_network = sub_graph

In [13]:
# level_dict[node] = smallest shortest-path length from any node without
# incoming edges to that node.
level_dict = {}
roots = [node for node, degree in final_network.in_degree() if degree == 0]
for root in roots:
    path_lengths = nx.single_source_shortest_path_length(final_network, root)
    for node, length in path_lengths.items():
        # Keep the smaller value when a node is reachable from several roots.
        level_dict[node] = min(level_dict.get(node, length), length)

In [14]:
# levels[level] = nodes at that level, leaving out every node whose name
# starts with "ENSG".
levels = collections.defaultdict(list)
for node, level in level_dict.items():
    if node.startswith("ENSG"):
        continue
    levels[level].append(node)

In [15]:
# Look up each listed gene in the 'From' -> 'To' mapping of gene_info.
from_to = gene_info.set_index('From')['To']
mut_gene['ensembl'] = mut_gene[0].map(from_to)
mut_gene.columns = ['gene name', 'ensembl']

# Keep the first ten rows, order them by gene name, and renumber them 0..9.
mut_gene = mut_gene.iloc[0:10, :].sort_values(by='gene name')
mut_gene.index = range(10)

In [16]:
# Per-gene pathway result files, in the order they are bound to p1..p10.
pathway_result_files = [
    "../../result/supervised/pathway/pathways_BRAF.csv",
    "../../result/supervised/pathway/pathways_CDH3.csv",
    "../../result/supervised/pathway/pathways_HRAS.csv",
    "../../result/supervised/pathway/pathways_KRAS.csv",
    "../../result/supervised/pathway/pathways_NRAS.csv",
    "../../result/supervised/pathway/pathways_PIK3CA.csv",
    "../../result/supervised/pathway/pathways_SCN5A.csv",
    "../../result/supervised/pathway/pathways_SERPINH1.csv",
    "../../result/supervised/pathway/pathways_TP53.csv",
    "../../result/supervised/pathway/pathways_VDR.csv",
]

# Each file is read with its first column as the index.
p1, p2, p3, p4, p5, p6, p7, p8, p9, p10 = [
    pd.read_csv(result_file, index_col=0) for result_file in pathway_result_files
]

In [17]:
# Pathway tables in load order; later cells pair pathway[i] with row i of mut_gene.
pathway = [p1, p2, p3, p4, p5, p6, p7, p8, p9, p10]

In [18]:
# For each of the ten genes: the directed shortest-path length from every listed
# pathway to the gene node ("experimental group"), next to the same length from
# a randomly drawn pathway of the same level ("control group").
# An unreachable target is recorded as infinity.
path_length = {}
for i in range(0, 10):
    gene_name, gene_node = mut_gene.iloc[i, 0], mut_gene.iloc[i, 1]
    distances = pd.DataFrame(columns=['experimental group', 'control group'])
    path_length[gene_name] = distances
    for p in pathway[i]['pathway']:
        try:
            exp_distance = nx.shortest_path_length(final_network, source=p, target=gene_node)
        except nx.NetworkXNoPath:
            exp_distance = float('inf')

        # Control: one random node drawn from the same level as p.
        same_level = levels[level_dict[p]]
        rand_pathway = same_level[random.randint(0, len(same_level) - 1)]
        try:
            ctrl_distance = nx.shortest_path_length(final_network, source=rand_pathway, target=gene_node)
        except nx.NetworkXNoPath:
            ctrl_distance = float('inf')

        distances.loc[p] = [exp_distance, ctrl_distance]

In [19]:
# Write one "<gene>_path_length.csv" file per gene into the working directory.
for gene_name, distances in path_length.items():
    distances.to_csv(gene_name + "_path_length.csv")

In [20]:
# Undirected version of the gene-augmented graph, used for the distance tests in the next cell.
undirected_network = final_network.to_undirected()

In [21]:
# Five repetitions per gene of the experimental-vs-random-control comparison on
# the undirected graph. Each repetition yields a Wilcoxon p-value and a Pearson
# correlation between the relative distance difference and the pathway's
# 1-based position in its table.
p_adj_all = pd.DataFrame(index=mut_gene['gene name'], columns=range(5))
correlation_all = pd.DataFrame(index=mut_gene['gene name'], columns=range(5))
# Rebinding path_length discards the directed results from the earlier cell;
# within a gene, only the table of the last repetition is kept.
path_length = {}
for i in range(0, 10):
    gene_name, gene_node = mut_gene.iloc[i, 0], mut_gene.iloc[i, 1]
    listed_pathways = pathway[i]['pathway']
    p_value, correlation = [], []
    for times in range(5):
        distances = pd.DataFrame(columns=['experimental group', 'control group'])
        path_length[gene_name] = distances
        for p in listed_pathways:
            exp_distance = nx.shortest_path_length(undirected_network, source=p, target=gene_node)
            # Control: one random node drawn from the same level as p.
            same_level = levels[level_dict[p]]
            rand_pathway = same_level[random.randint(0, len(same_level) - 1)]
            ctrl_distance = nx.shortest_path_length(undirected_network, source=rand_pathway, target=gene_node)
            distances.loc[p] = [exp_distance, ctrl_distance]

        experimental = distances['experimental group']
        control = distances['control group']
        _, pval = wilcoxon(experimental, control)
        corr, _ = pearsonr((experimental - control) / experimental, range(1, len(listed_pathways) + 1))
        p_value.append(pval)
        correlation.append(corr)

    # Benjamini-Hochberg adjustment across this gene's five repetitions.
    _, p_value_adj, _, _ = multipletests(p_value, alpha=0.05, method='fdr_bh')
    p_adj_all.loc[gene_name] = p_value_adj
    correlation_all.loc[gene_name] = correlation

In [22]:
p_adj_all

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
gene name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BRAF,0.125047,0.46353,0.015377,0.799195,0.388141,0.13141,0.707594,0.027831,0.0345,0.536593
CDH3,0.003775,0.003172,0.006747,0.572606,0.133991,0.593869,0.012039,0.003111,0.001839,0.431719
HRAS,0.164354,0.177398,0.037949,0.204887,0.56368,0.018658,0.058079,0.013519,0.222223,0.06227
KRAS,0.051801,0.000948,0.01543,0.000108,0.026087,0.004961,0.000447,0.000936,0.003473,0.000344
NRAS,0.252034,0.613662,0.460742,0.097022,0.044591,0.034364,0.08511,0.373363,0.152147,0.003104
PIK3CA,0.053721,0.341248,0.160946,0.637825,0.028501,0.000982,0.106467,0.432259,0.725663,0.179312
SCN5A,0.036126,0.03094,0.038611,0.075553,0.009233,0.078602,0.099907,0.022456,0.028034,0.363139
SERPINH1,0.862406,0.927034,0.274964,0.925459,0.335521,0.582693,0.903071,0.491551,0.45699,0.259713
TP53,0.95177,0.907733,0.250596,0.284301,0.356166,0.361591,0.584104,0.807429,0.43743,0.257568
VDR,0.109771,0.013385,0.043453,0.016108,0.030722,0.016923,0.027754,0.029565,0.004604,0.002066


In [23]:
correlation_all

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
gene name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BRAF,0.005815,0.04799,0.073674,-0.028375,0.042578,0.067445,0.100688,0.037487,0.045336,0.007804
CDH3,-0.014054,0.025068,-0.014793,-0.095549,-0.034256,0.093862,-0.013984,-0.063431,-0.010184,-0.06519
HRAS,-0.043166,-0.008696,-0.073381,0.072629,-0.012993,0.003121,0.007234,0.001697,-0.00935,0.06805
KRAS,-0.035614,-0.133171,-0.021351,-0.114536,-0.040312,0.048684,-0.133687,-0.055568,0.01439,-0.015135
NRAS,0.058169,0.02669,-0.054718,0.024983,0.144563,0.001725,0.07261,0.085081,0.137342,0.092308
PIK3CA,0.171811,0.023636,-0.00738,0.129893,0.195845,0.124946,0.19581,0.059281,0.213509,0.116204
SCN5A,-0.097598,-0.074352,-0.135635,-0.085725,-0.163713,-0.062927,-0.163853,-0.087931,-0.164325,-0.114969
SERPINH1,-0.159285,-0.167137,-0.126108,-0.184493,-0.17585,-0.158564,-0.1864,-0.141712,-0.123972,-0.146011
TP53,0.024165,-0.043577,-0.056507,-0.133842,0.019625,0.047115,-0.036224,-0.038402,0.045475,-0.11877
VDR,-0.096051,-0.102953,-0.090849,0.003287,-0.000781,-0.070694,-0.030342,-0.074312,-0.075124,-0.015535


In [58]:
# Write the adjusted p-values and the correlations to CSV files in the working directory.
p_adj_all.to_csv("p_adj.csv")
correlation_all.to_csv("corr.csv")