In [9]:
from netcoloc import validation
import pandas as pd

In [5]:
def _get_mp_graph(datafile="parsed_mp.txt"):
    """Build a directed graph from a headerless, tab-separated edge list.

    Args:
        datafile: Path to a tab-separated file without a header row. Column 0
            is used as the edge source and column 1 as the edge target.

    Returns:
        A ``networkx.DiGraph`` with one edge per row of ``datafile``.
    """
    # NOTE(review): `nx` is not imported in the visible import cell; this
    # function relies on networkx being available as `nx` in the notebook
    # namespace at call time - confirm or add the import.
    mp_data = pd.read_csv(datafile, sep="\t", header=None)
    return nx.from_pandas_edgelist(mp_data, 0, 1, create_using=nx.DiGraph)

def genes_per_node(MPO):
    """Collect gene sets per ontology term and term sets per gene.

    Each term starts with ``set(MPO.term_2_gene[term])``. Terms are then
    visited from the LAST entry of ``MPO.topological_sorting(top_down=False)``
    to the first; at each visit the term absorbs the current gene sets of its
    children (``MPO.parent_2_child[term]``, ignoring a self-reference) and is
    recorded against every gene it now holds.

    Args:
        MPO: Ontology-like object exposing ``topological_sorting``,
            ``term_2_gene``, ``gene_2_term``, ``genes`` and ``parent_2_child``.

    Returns:
        A tuple ``(counts, genes, results)``:
            counts: dict term -> number of genes in ``results[term]``.
            genes: dict seeded with ``set(MPO.gene_2_term[g])`` for every
                ``g`` in ``MPO.genes`` and extended with the visited terms
                for each member of a term's gene set.
            results: dict term -> set of genes after merging children.
    """
    ordering = MPO.topological_sorting(top_down=False)
    term_genes = {term: set(MPO.term_2_gene[term]) for term in ordering}
    gene_terms = {gene: set(MPO.gene_2_term[gene]) for gene in MPO.genes}

    # NOTE(review): terms are walked from the END of the ordering. If
    # `top_down=False` lists descendants first, a parent is merged before its
    # children have absorbed their own descendants, so only direct children
    # contribute (unless `term_2_gene` is already propagated) - confirm intent.
    for term in reversed(list(ordering)):
        for child in MPO.parent_2_child[term]:
            if child == term:
                continue  # skip self-loops
            term_genes[term] = term_genes[term] | term_genes[child]

        # NOTE(review): `gene_terms` is keyed by entries of `MPO.genes`, while
        # the members iterated here are whatever `term_2_gene` holds; verify
        # that both use the same kind of identifier.
        for gene in term_genes[term]:
            gene_terms[gene] = gene_terms.get(gene, set()) | {term}

    counts = {term: len(members) for term, members in term_genes.items()}
    return counts, gene_terms, term_genes

## Import MPO data

In [2]:
# Load the mappings between genes and mammalian phenotypes from the MGI report.
MGI_PHENO_GENO_URL = 'http://www.informatics.jax.org/downloads/reports/MGI_PhenoGenoMP.rpt'
mgi_df = validation.load_MGI_mouseKO_data(url=MGI_PHENO_GENO_URL)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-24184...done.
Finished.
38 input query terms found dup hits:
	[('Abo', 3), ('Adam1a', 2), ('Adam1b', 2), ('Adam5', 2), ('Casp12', 2), ('Ccdc39', 2), ('Chaserr', 2
12279 input query terms found no hit:
	['+|Cav1', '+|Cm', '+|Del(10Prmt2-Pdxk)4Yey', '+|Del(11Mpo-Chad)18Brd', '+|Del(11Stat5a-Stat5b)1Mam'
Pass "re

In [3]:
# Load the MPO ontology from its OBO file, passing the gene/phenotype table
# (`mgi_df`) as the gene mapping.
MPO_OBO_URL = 'http://www.informatics.jax.org/downloads/reports/MPheno_OBO.ontology'
MPO = validation.load_MPO(
    url=MPO_OBO_URL,
    use_genes=True,
    mapping=mgi_df,
)

14598


In [7]:
# Unpack genes_per_node's three results: per-term gene counts, the per-gene
# term sets, and the per-term gene sets. `term_mapping` is also read at
# notebook level by get_gene_term_map, so this cell must run before it is called.
term_counts, gene_mapping, term_mapping = genes_per_node(MPO)

## Import community data

In [15]:
# Read the per-community table (first column becomes the index), then preview it.
hier_df_genes = pd.read_csv(
    "hier_df_genes.tsv",
    sep="\t",
    index_col=0,
)
hier_df_genes.head()

Unnamed: 0,CD_MemberList,frac_d1_seeds,frac_d2_seeds
C877,HELZ EMX2 HIST1H3A RAD51 SYT12 GYPC KDM3B ERC1...,0.353583,0.17757
C898,STX4 LSAMP SPOCK1 SGCB STXBP5 TRP-AGG2-6 GPRC5...,0.363636,0.181818
C904,BCL11B PKIB PKIA LMO1 NR2F1 LDB2 ZFPM2 GATA3 A...,0.3,0.2
C906,PROM2 SPINT1 TNIK PKP3 SSH3 FEZ2 CORIN WWC1 CAPN1,0.333333,0.111111
C911,ZNF629 ZKSCAN5 LINGO1 ZNF202 ZKSCAN2 ZNF263 ZN...,0.571429,0.142857


In [107]:
# Map every community ID to the de-duplicated list of its member genes, parsed
# from the space-separated "CD_MemberList" column.
community_genes = {}
for comm in hier_df_genes.index:
    members = hier_df_genes.loc[comm, "CD_MemberList"].split(" ")
    community_genes[comm] = list(set(members))

## Get lists of MP terms

In [142]:
def get_gene_term_map(communities, terms, MPO, community_names=None,
                      gene_lists=None, term_genes=None):
    """Build a 0/1 gene-by-(community, MP term) membership table.

    Rows are the genes that belong to any of ``communities``. There is one
    column per community (1 if the gene is a member) followed by one column
    per entry of ``terms`` (1 if the gene is in that term's gene set), named
    ``"<description> (<term>)"``. Term columns are attached with a left join,
    so genes annotated to a term but absent from every listed community do
    not become rows.

    Args:
        communities: Sequence of community IDs; each must be a key of
            ``gene_lists``.
        terms: Iterable of term IDs; each must be a key of ``term_genes`` and
            a row label of ``MPO.node_attr``.
        MPO: Ontology-like object exposing ``genes`` (indexable by position)
            and ``node_attr`` (table with a ``description`` column).
        community_names: Optional display names parallel to ``communities``.
            When given, community columns are renamed ``"<name> (<id>)"``.
        gene_lists: Mapping of community ID -> list of gene names. Defaults
            to the notebook-level ``community_genes``.
        term_genes: Mapping of term ID -> iterable of positions into
            ``MPO.genes``. Defaults to the notebook-level ``term_mapping``.

    Returns:
        pandas.DataFrame of ints, indexed by gene name.

    Raises:
        ValueError: If ``community_names`` is given and its length differs
            from that of ``communities``.
    """
    # Fall back to the notebook globals so existing four-argument calls keep
    # working; passing the mappings explicitly avoids hidden kernel state.
    if gene_lists is None:
        gene_lists = community_genes
    if term_genes is None:
        term_genes = term_mapping
    if community_names is not None and len(community_names) != len(communities):
        raise ValueError(
            "community_names must have the same length as communities "
            "(%d != %d)" % (len(community_names), len(communities))
        )

    results = pd.DataFrame({"genes": []})
    # The outer joins accumulate the union of all community members.
    for comm in communities:
        membership = pd.DataFrame({comm: 1, "genes": gene_lists[comm]})
        results = results.merge(membership, on=["genes"], how="outer")
    # The left joins add annotation flags without introducing new genes.
    for term in terms:
        col_name = get_MP_description(term, MPO) + " (" + term + ")"
        annotated = list({MPO.genes[i] for i in term_genes[term]})
        flags = pd.DataFrame({col_name: 1, "genes": annotated})
        results = results.merge(flags, on="genes", how="left")

    # Missing flags mean "not a member" / "not annotated".
    results = results.fillna(0).set_index("genes")
    results.index.name = None
    if community_names is not None:
        new_names = {
            comm: name + " (" + comm + ")"
            for comm, name in zip(communities, community_names)
        }
        results = results.rename(columns=new_names)
    return results.astype(int)

def get_MP_description(term, MPO):
    """Return the ``description`` attribute of ``term`` from ``MPO.node_attr``."""
    return MPO.node_attr.loc[term].description
    

In [95]:
# MP term IDs that become the term columns of the `prenatal` gene/term map
# (and, concatenated after the postnatal IDs, of the `combined` map).
prenatal_terms = ['MP:0010866', 'MP:0001697', 'MP:0005076', 'MP:0013202', 'MP:0012555', 'MP:0010770', 'MP:0008762', 
                  'MP:0003861', 'MP:0002152', 'MP:0004811', 'MP:0001614', 'MP:0002925', 'MP:0002019', 'MP:0002085', 
                  'MP:0001672']
# MP term IDs that become the term columns of the `postnatal` gene/term map.
postnatal_terms = ['MP:0001259', 'MP:0001265', 'MP:0001264', 'MP:0001256', 'MP:0012321', 'MP:0005164', 'MP:0009642', 
                   'MP:0003953', 'MP:0002078', 'MP:0013561', 'MP:0013560', 'MP:0013558', 'MP:0002069', 'MP:0014114', 
                   'MP:0002067', 'MP:0008946', 'MP:0002723', 'MP:0005330', 'MP:0005620', 'MP:0002269', 'MP:0004087', 
                   'MP:0000759', 'MP:0004215', 'MP:0002972', 'MP:0010630', 'MP:0000266']

In [149]:
# Community IDs and their display names, declared once per group so the
# combined table is built from the same definitions.
postnatal_comms, postnatal_names = ['C907', 'C894'], ["MSC", "GSR"]
prenatal_comms, prenatal_names = ['C882', 'C889', 'C909'], ["DM1", "PP", "RPA"]

postnatal = get_gene_term_map(postnatal_comms, postnatal_terms, MPO, postnatal_names)
prenatal = get_gene_term_map(prenatal_comms, prenatal_terms, MPO, prenatal_names)
combined = get_gene_term_map(
    postnatal_comms + prenatal_comms,
    postnatal_terms + prenatal_terms,
    MPO,
    postnatal_names + prenatal_names,
)

In [147]:
# Write the postnatal gene/term table as a tab-separated file.
postnatal.to_csv(
    "~/Data/Transfer/rat/postnatal_gene_term_map_C907_C894.tsv",
    sep="\t",
)

In [148]:
# Write the prenatal gene/term table as a tab-separated file.
prenatal.to_csv(
    "~/Data/Transfer/rat/prenatal_gene_term_map_C882_C889_C909.tsv",
    sep="\t",
)

In [150]:
# Write the combined (postnatal + prenatal) gene/term table as a tab-separated file.
combined.to_csv(
    "~/Data/Transfer/rat/combined_gene_term_map_C907_C894_C882_C889_C909.tsv",
    sep="\t",
)