In [36]:
import pandas as pd
import numpy as np
import networkx as nx
from networkx.readwrite import json_graph
import json
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [15]:
relation_file = "../data/ReactomePathwaysRelation.txt"
pathway_name = "../data/ReactomePathways.txt"

In [38]:
def generate_tree(relation_file = relation_file):
    rel_df = pd.read_csv(relation_file, sep = "\t", header = None, index_col = 0, names=['id'])
    real_names = pd.read_csv(pathway_name, sep='\t', index_col=0, names=['pathway', 'name', 'organism'])
    
    
    cut = rel_df.index.str.contains('MMU') & rel_df['id'].str.contains('MMU')
    rel_df = rel_df.loc[cut]
    
    namelist = []
    for i in rel_df.index:
        conversion = real_names.loc[i, 'name']
        namelist.append(conversion)
        
    childlist = []   
    for i in rel_df['id']:
        conversion = real_names.loc[i, 'name']
        childlist.append(conversion)
    names = pd.DataFrame()
    names['parent'] = namelist
    names['child'] = childlist
    
    
    G = nx.DiGraph()
    G.add_edges_from(names.values)
    roots = [n for n,d in G.in_degree() if d==0]
    
    roots_df = pd.DataFrame(columns = [['parentId', 'id']])
    roots_df['id'] = roots
    roots_df['parentId'] = 'Human'
    
    roots_df = pd.DataFrame(roots_df.values, columns = ['parentId', 'id'])
    rel_df = pd.DataFrame(names.values, columns = ['parentId', 'id'])
    
    tree = roots_df.append(rel_df)
    return tree

rel_df = generate_tree()

In [34]:
def default(o):
     if isinstance(o, np.integer): return int(o)
     raise TypeError


In [151]:
def sunburst(in_df, outname = 'sun_tree.json'):
    # in_df has as indexes the pathway reactome ID, 'value' as the colors to be plot, 'ngenes' as the width, and 'descr' as the description of the pathway
    
    #in_df = in_df.loc[[x for x in in_df.index if 'MMU' in x]]
    max_val = in_df['value'].max()
    in_df['max_val'] = max_val
    sorted_in_df = in_df.sort_values(by='value', ascending = False)
    sorted_in_df['C'] = np.arange(len(sorted_in_df))+1
    in_df = sorted_in_df
    print(in_df)
    topPaths = rel_df.loc[(rel_df['parentId'] == 'Human'), 'id']
    homoNgenes = np.sum(in_df.loc[[x in topPaths.tolist() for x in in_df.index],'ngenes'])
    homoNode = pd.DataFrame([[0,homoNgenes,"Human", max_val]], columns = ["value", "ngenes", "Organism", 'max_val']).xs(0)
    homoNode.name = 'Human'
    
    in_df = in_df.append(homoNode)
    print(in_df)
    
    topDict = in_df.to_dict()
    
    
    pathways = in_df.index
    
    n_path = len(pathways)
    
    subset_vec = [x in pathways for x in rel_df.iloc[:,0]] and [x in pathways for x in rel_df.iloc[:,1]] 
    sub_rel_df = rel_df[subset_vec] 
    
    G = nx.DiGraph()
    
    G.add_nodes_from(pathways)
    G.add_edges_from(sub_rel_df.values)
    
    tree = nx.algorithms.dag.dag_to_branching(G)
    
    secondDict = nx.get_node_attributes(tree,'source')
    
    thirdDict = {'value':{}, 'ngenes':{}, 'max_val': {}}
    for key, value in secondDict.items():
        thirdDict['value'].update({key : topDict['value'][value]})
        thirdDict['ngenes'].update({key : topDict['ngenes'][value]})
        thirdDict['max_val'].update({key : topDict['max_val'][value]})

        
        
    nx.set_node_attributes(tree, thirdDict['value'], name = 'value')
    nx.set_node_attributes(tree, thirdDict['ngenes'], name = 'ngenes')
    nx.set_node_attributes(tree, thirdDict['max_val'], name = 'max_val')
    
    root = [v for v, d in tree.in_degree() if d == 0][0]
    out_json = json_graph.tree_data(tree, root)
    
    with open(outname, 'w') as outfile:
        json.dump(out_json, outfile, default=default)

sunburst(df_dict['Cluster 1 q-values'], outname = 'sunburst/intclust__1.json')

                                                        value  ngenes  \
Chromosome Maintenance                              28.690694     112   
RMTs methylate histone arginines                    27.805190      81   
Cellular Senescence                                 27.248127     221   
Nucleosome assembly                                 27.248127      75   
Deposition of new CENPA-containing nucleosomes ...  27.248127      75   
HDMs demethylate histones                           26.953585      50   
Senescence-Associated Secretory Phenotype (SASP)    26.833820     123   
Telomere Maintenance                                25.978580      85   
Estrogen-dependent gene expression                  25.929925     151   
Oxidative Stress Induced Senescence                 25.555362     135   
Meiosis                                             25.506739     127   
Reproduction                                        25.441606     157   
Base Excision Repair                               

In [85]:
def read_reactome(file_name, gene_name_start = "ENSG0"):
    df = pd.read_csv(file_name, sep='\t', header=None)
    
    if gene_name_start == None:
        sub_df = df
    else:
        subset_vec = df[0].str.startswith(gene_name_start)
        sub_df = df.loc[subset_vec]
    
    genes_df = sub_df.groupby(1)[0].apply(list)
    names_df = sub_df.groupby(1)[3].max()
    
    out_df = pd.concat([genes_df,names_df], axis=1)
    out_df.columns = ['genes', 'pathway_name']
    out_df.index = out_df.pathway_name
    
    return out_df

In [138]:
cluster_df = pd.read_csv("../data/pathway_qvalues.csv", index_col = 0)

clusterindex = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]
cluster_df = cluster_df.iloc[:,clusterindex]
reactome_ngenes = read_reactome("../data/Ensembl2Reactome_All_Levels.txt")

length_dict = {}
for i in cluster_df.index:
    nr_genes = len(reactome_ngenes.loc[i, "genes"])
    #print(nr_genes)
    length_dict[i] = nr_genes

cluster_df['ngenes'] = cluster_df.index.map(length_dict)


df_dict = {}

for i in cluster_df.iloc[:,:-1]:
    df = pd.DataFrame(index = cluster_df.index)
    df['value'] = cluster_df.loc[:,i]
    df['ngenes'] = cluster_df.loc[:,'ngenes']
    df['Organism'] = 'Human'
    print(df['value'].max
    df_dict[i] = df
    
for i in df_dict:
    clust = i.strip('Cluster q-va')
    sunburst(df_dict[i], outname = f'../exp/intclust_{clust}.json')
    
#sunburst(cluster_df, outname = 'pathways_per_intclusts.json')


SyntaxError: invalid syntax (<ipython-input-138-d6b68f1e301c>, line 24)