In [92]:
import warnings
import numpy as np
import pandas as pd
import random as rn
import requests
from mygene import MyGeneInfo
from scipy.stats import hypergeom
from statsmodels.stats import contingency_tables
from os.path import exists
import mygene
mg = mygene.MyGeneInfo()
import ddot
from ddot import Ontology

## Questions

1. Should I be restricting mouse data to only human orthologs?
2. Should I be restricting community data to only mouse orthologs?
3. Are we using the contingency table correctly?
4. Is it worth doing any of this?

In [290]:
def load_MGI_mouseKO_data(url='http://www.informatics.jax.org/downloads/reports/MGI_PhenoGenoMP.rpt',
                          map_using="mygeneinfo", update=False):
    """
    Function to parse and load mouse knockout data from MGI.

    :param url: location of MGI knockout data
    :type url: str
    :return: parsed MGI knockout dataframe, including column for human orthologs
    :rtype: :py:class:`pandas.DataFrame`
    """
    # download MGI phenotype data
    if (not exists('MGI_PhenoGenoMP.rpt')) or update:
        r = requests.get(url, allow_redirects=True)
        open('MGI_PhenoGenoMP.rpt', 'wb').write(r.content)

    # parse the downloaded MGI phenotype data
    mgi_df = pd.read_csv('MGI_PhenoGenoMP.rpt', sep='\t',
                         names=['MGI_Allele_Accession_ID',
                                'Allele symbol', 'involves',
                                'MP', 'PMID', 'MGI_marker_accession_ID'])
    if map_using == "mygeneinfo":
        # extract gene names
        gene_name = [a.split('<')[0] for a in mgi_df['Allele symbol'].tolist()]
        mgi_df['gene_name'] = gene_name
        mgi_df.index = mgi_df['gene_name']
        # map mouse genes to human orthologs
        mouse_genes = list(np.unique(mgi_df['gene_name']))
        mg_mapped = mg.querymany(mouse_genes,
                                 as_dataframe=True, species=['mouse', 'human'],
                                 scopes='symbol', fields='symbol')
        # drop genes with no human ortholog
        print(len(mg_mapped))
        mg_mapped = mg_mapped.dropna(subset=['symbol'])
        print(len(mg_mapped))
        # drop duplicates
        mg_mapped = mg_mapped[~mg_mapped.index.duplicated(keep='first')]
        print(len(mg_mapped))
        mgi_df['human_ortholog'] = mgi_df['gene_name'].map(dict(mg_mapped['symbol']))
        return mgi_df

    elif map_using == "mgi":
        if not exists('MRK_List2.rpt') or update:
            keep_url = "http://www.informatics.jax.org/downloads/reports/MRK_List2.rpt"
            r_map = requests.get(keep_url, allow_redirects=True)
            open('MRK_List2.rpt', 'wb').write(r_map.content)
        keep = pd.read_csv('MRK_List2.rpt', sep="\t", usecols=["MGI Accession ID", "Marker Symbol",
                                                                     "Feature Type", "Marker Name"])
        keep = keep.loc[keep["Feature Type"].isin(["protein coding gene"])].reset_index(drop=True)
        mgi_df["MGI"] = mgi_df.MGI_marker_accession_ID.apply(lambda x: x.split("|"))
        mgi_df = mgi_df.explode("MGI", ignore_index=True)
        mgi_df["MGI"] = [mg if type(mg) is str else mg[0] for mg in mgi_df.MGI]
        mgi_df = mgi_df.loc[mgi_df["MGI"].isin(keep["MGI Accession ID"])]
        mgi_df = mgi_df.merge(keep.loc[:, ("MGI Accession ID", "Marker Symbol")], left_on="MGI",
                              right_on="MGI Accession ID", how="left")

        if not exists('HMD_HumanPhenotype.rpt') or update:
            map_url = "http://www.informatics.jax.org/downloads/reports/HMD_HumanPhenotype.rpt"
            r_map = requests.get(map_url, allow_redirects=True)
            open('HMD_HumanPhenotype.rpt', 'wb').write(r_map.content)
        mapping = pd.read_csv('HMD_HumanPhenotype.rpt', sep="\t", header=None, usecols=[0, 2, 3],
                              index_col=False, names=["symbol", "gene_name", "MGI"])
        mapping = mapping.loc[mapping["MGI"].isin(keep["MGI Accession ID"])]

        mg_mapped = mgi_df.merge(mapping, on="MGI", how="left")
        mg_mapped.loc[mg_mapped.symbol.isna(), "gene_name"] = mg_mapped.loc[mg_mapped.symbol.isna(), "Marker Symbol"]
        mg_mapped = mg_mapped.drop_duplicates()
        mg_mapped.rename(columns={"symbol": 'human_ortholog'}, inplace=True)
        return mg_mapped
    elif map_using == "mgi2":
        mgi_df["MGI_Allele_Accession_ID"] = mgi_df["MGI_Allele_Accession_ID"].apply(lambda x: re.sub("\([\s\*\/a-zA-Z0-9,-]+\)", "", x))
        mgi_df["allele_count"] = mgi_df.MGI_Allele_Accession_ID.apply(lambda x: len(x.split(",")))
        mgi_df["MGI"] = mgi_df.MGI_marker_accession_ID.apply(lambda x: x.split("|"))
        map_df = mgi_df[mgi_df.allele_count==1]
        map_df = map_df.explode(column="MGI")
        #map_df = map_df.assign(MGI=map_df.MGI.apply(lambda x: x[0]))
        #return map_df
        if not exists('MRK_List2.rpt') or update:
            keep_url = "http://www.informatics.jax.org/downloads/reports/MRK_List2.rpt"
            r_map = requests.get(keep_url, allow_redirects=True)
            open('MRK_List2.rpt', 'wb').write(r_map.content)
        keep = pd.read_csv('MRK_List2.rpt', sep="\t", usecols=["MGI Accession ID", "Marker Symbol",
                                                                     "Feature Type", "Marker Name"])
        map_df = map_df.merge(keep, left_on="MGI", right_on="MGI Accession ID")
        pc_df = map_df[map_df["Feature Type"]=="protein coding gene"]
        if not exists('HMD_HumanPhenotype.rpt') or update:
            map_url = "http://www.informatics.jax.org/downloads/reports/HMD_HumanPhenotype.rpt"
            r_map = requests.get(map_url, allow_redirects=True)
            open('HMD_HumanPhenotype.rpt', 'wb').write(r_map.content)
        hs = pd.read_csv('HMD_HumanPhenotype.rpt', sep="\t", header=None, usecols=[0, 2, 3],
                              index_col=False, names=["symbol", "gene_name", "MGI"])
        pc_df = pc_df.merge(hs, on="MGI", how="left")
        
        pc_df = pc_df.loc[:, ['MGI_Allele_Accession_ID', 'Allele symbol', 'involves', 'MP', 'PMID', 'MGI',
                              'Marker Symbol', 'Marker Name', 'symbol'] ]
        
        pc_df.rename(columns={"symbol": 'human_ortholog', "Marker Symbol": "gene_name"}, inplace=True)

        return pc_df


def load_MPO(url='http://www.informatics.jax.org/downloads/reports/MPheno_OBO.ontology', use_genes=False,
             mapping=None, restrict_to=None, use_display=False):
    """
    Function to parse and load mouse phenotype ontology, using DDOT's ontology module

    :param url: URL containing MPO ontology file
    :type url: str
    :param use_genes:
    :type use_genes: bool
    :param mapping:
    :type mapping:
    :param restrict_to:
    :type restrict_to:
    :param use_display:
    :type use_display: bool
    :return: MPO parsed using DDOT
    :rtype: :py:class:`ddot.Ontology`
    :raises ImportError: If DDOT package is not found
    """
    if use_genes:
        assert mapping is not None, "You must supply a mapping dataframe if use_genes==True"
    # download the mammalian phenotype ontology, parse with ddot
    r = requests.get(url, allow_redirects=True)
    open('MPheno_OBO.ontology', 'wb').write(r.content)
    ddot.parse_obo('MPheno_OBO.ontology',
                   'parsed_mp.txt',
                   'id2name_mp.txt',
                   'id2namespace_mp.txt',
                   'altID_mp.txt')

    MP2desc = pd.read_csv('id2name_mp.txt', sep='\t', names=['MP', 'description'], index_col='MP')

    MP2desc = MP2desc.loc[MP2desc.index.dropna()]  # drop NAN from index
    print(len(MP2desc))

    hierarchy = pd.read_table('parsed_mp.txt',
                              sep='\t',
                              header=None,
                              names=['Parent', 'Child', 'Relation', 'Namespace'])

    if use_display:
        from IPython.core.display import display
        display(MP2desc.head())
        display(hierarchy.head())

    if use_genes:  # map genes from `mapping` to terms in the ontology
        mouse_mapping = mapping.dropna().loc[:, ("human_ortholog", "MP")].reset_index()
        mouse_mapping = mouse_mapping.loc[:, ("human_ortholog", "MP")]
        mouse_mapping.columns = ["Gene", "Term"]
        if restrict_to is not None:  # restrict to specified subset of genes
            mouse_mapping = mouse_mapping.loc[mouse_mapping.Gene.isin(restrict_to)]
        # generate ontology using gene mapping
        MPO = Ontology.from_table(
            table=hierarchy,
            parent='Parent',
            child='Child',
            add_root_name='MP:00SUPER',
            ignore_orphan_terms=True,
            mapping=mouse_mapping,
            mapping_parent='Term',
            mapping_child='Gene')
    else:  # create the ontology without gene mappings
        MPO = Ontology.from_table(
            table=hierarchy,
            parent='Parent',
            child='Child',
            add_root_name='MP:00SUPER',
            ignore_orphan_terms=True)

    # add description to node attribute
    terms_keep = list(np.unique(hierarchy['Parent'].tolist()+hierarchy['Child'].tolist()))
    MPO.node_attr = MP2desc.loc[terms_keep]

    return MPO



In [291]:
mgi_df = load_MGI_mouseKO_data(map_using="mgi2", update=False)
mgi_df

Unnamed: 0,MGI_Allele_Accession_ID,Allele symbol,involves,MP,PMID,MGI,gene_name,Marker Name,human_ortholog
0,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0000600,12529408,MGI:97874,Rb1,RB transcriptional corepressor 1,RB1
1,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0001716,16449662,MGI:97874,Rb1,RB transcriptional corepressor 1,RB1
2,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0001698,16449662,MGI:97874,Rb1,RB transcriptional corepressor 1,RB1
3,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0001092,16449662,MGI:97874,Rb1,RB transcriptional corepressor 1,RB1
4,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0000961,16449662,MGI:97874,Rb1,RB transcriptional corepressor 1,RB1


In [295]:
MPO3 = load_MPO(use_genes=True, mapping=mgi_df)

14555


In [302]:
[MPO3.genes[x] for x in MPO3.term_2_gene["MP:0011940"]]

['ABHD6',
 'ACBD5',
 'ACOT13',
 'ADIPOQ',
 'AGRP',
 'AKR1B10',
 'AKR1B15',
 'ALK',
 'ALKBH7',
 'ARID5B',
 'ATN1',
 'C1QTNF5',
 'CALCA',
 'CAMKK2',
 'CDKAL1',
 'CHRM3',
 'CIDEB',
 'CISD2',
 'CLDN12',
 'CPT1C',
 'CTF1',
 'CTSD',
 'CXCL14',
 'CYP11A1',
 'DLL1',
 'DMBX1',
 'DRD2',
 'ENHO',
 'ENPP1',
 'FADS2',
 'FBN1',
 'FLNC',
 'FOXO6',
 'FTO',
 'GCGR',
 'GHSR',
 'GLP1R',
 'GPR82',
 'HCN2',
 'HDAC4',
 'HILPDA',
 'HNF4G',
 'HRH1',
 'HTR2C',
 'HTR6',
 'IMPACT',
 'KCTD1',
 'KRT6A',
 'KRT6B',
 'LEP',
 'LYPD3',
 'MAF1',
 'MAPK8',
 'MAPK8IP1',
 'MFRP',
 'MLXIPL',
 'MYH1',
 'NDUFA1',
 'NEGR1',
 'NMUR2',
 'NPC1',
 'NPFFR2',
 'NPY1R',
 'NPY5R',
 'NPY6R',
 'OPN3',
 'OR4M1',
 'OR4M2',
 'PDE10A',
 'PHOSPHO1',
 'PLIN2',
 'PMCH',
 'PMCHL2',
 'PPARG',
 'PRCP',
 'PRKAB1',
 'PRLHR',
 'PTGER1',
 'QRFP',
 'RAPGEF3',
 'RCAN2',
 'RMI1',
 'SEMA3A',
 'SIK2',
 'SLC16A1',
 'SLC27A5',
 'SLC30A7',
 'SLC52A3',
 'SLC5A1',
 'SLN',
 'SNAP25',
 'TAFA1',
 'THRA',
 'TNN',
 'TRIB2',
 'TRIM37',
 'UCN2',
 'ULK4',
 'VIPR2',
 '

## Original

In [3]:
mgi_orig = load_MGI_mouseKO_data(update=True)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-24140...done.
Finished.
38 input query terms found dup hits:
	[('Abo', 3), ('Adam1a', 2), ('Adam1b', 2), ('Adam5', 2), ('Casp12', 2), ('Ccdc39', 2), ('Chaserr', 2
12251 input query terms found no hit:
	['+|Cav1', '+|Cm', '+|Del(10Prmt2-Pdxk)4Yey', '+|Del(11Mpo-Chad)18Brd', '+|Del(11Stat5a-Stat5b)1Mam'
Pass "re

In [96]:
MPO_orig=load_MPO(use_genes=True, mapping=mgi_orig)

14555


9948

In [296]:
nodes = nx.dfs_preorder_nodes(mp_graph, "MP:0002069")
nodes = [n for n in nodes]
orig_genes = []
new_genes = []
for node in nodes:
    print(node)
    orig_genes = set([MPO_orig.genes[n] for n in MPO_orig.term_2_gene[node]])
    new_genes = set([MPO3.genes[n] for n in MPO3.term_2_gene[node]])
    print(orig_genes.intersection(new_genes))
    print(orig_genes.union(new_genes))
    print("Original ", orig_genes.difference(new_genes))
    print("new ", new_genes.difference(orig_genes))
    print("\n")

MP:0002069
set()
set()
Original  set()
new  set()


MP:0001422
{'NEUROD4', 'SCT', 'UCN2', 'PTGS1', 'OXT', 'SCTR', 'TRPV4', 'PTGS2'}
{'NEUROD4', 'SCT', 'UCN2', 'PTGS1', 'OXT', 'SCTR', 'TRPV4', 'PTGS2'}
Original  set()
new  set()


MP:0001423
{'ITPR3', 'NOS1', 'POU2F3', 'SLC18A2', 'DISC1', 'ACADS', 'ADORA2A'}
{'ITPR3', 'NOS1', 'POU2F3', 'SLC18A2', 'DISC1', 'BDNF', 'EHMT1', 'ACADS', 'ADORA2A'}
Original  {'BDNF', 'EHMT1'}
new  set()


MP:0001987
{'SLC29A1', 'FAAH', 'CCL2', 'SLC6A1', 'EPS8', 'UNC79', 'CRHR1', 'TRPV1', 'GPR26', 'GABRB1', 'PRKCG'}
{'SLC29A1', 'FAAH', 'SLC6A1', 'LINC02210-CRHR1', 'EPS8', 'TRPV1', 'GPR26', 'CCL2', 'CCL8', 'UNC79', 'CRHR1', 'GABRB1', 'PRKCG'}
Original  set()
new  {'LINC02210-CRHR1', 'CCL8'}


MP:0002119
set()
set()
Original  set()
new  set()


MP:0002570
{'CCL2', 'CCR2', 'CACNA1B', 'DRD2', 'NPY2R', 'CCL3', 'MPDZ'}
{'CCL3L1', 'CCR2', 'CCL3L3', 'MPDZ', 'CCL18', 'CCL2', 'CCL8', 'DRD2', 'NPY2R', 'CCL3', 'CACNA1B'}
Original  set()
new  {'CCL18', 'CCL3L1', 'CCL3L3', '

## From GenoPheno

In [4]:
mgi_new = load_MGI_mouseKO_data(map_using="mgi", update=True)

In [97]:
MPO_new=load_MPO(use_genes=True, mapping=mgi_new)

14555


In [5]:
mgi_new.head()

Unnamed: 0,MGI_Allele_Accession_ID,Allele symbol,involves,MP,PMID,MGI_marker_accession_ID,MGI,MGI Accession ID,Marker Symbol,human_ortholog,gene_name
0,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0000600,12529408,MGI:97874,MGI:97874,MGI:97874,Rb1,RB1,Rb1
1,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0001716,16449662,MGI:97874,MGI:97874,MGI:97874,Rb1,RB1,Rb1
2,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0001698,16449662,MGI:97874,MGI:97874,MGI:97874,Rb1,RB1,Rb1
3,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0001092,16449662,MGI:97874,MGI:97874,MGI:97874,Rb1,RB1,Rb1
4,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,involves: 129S2/SvPas,MP:0000961,16449662,MGI:97874,MGI:97874,MGI:97874,Rb1,RB1,Rb1


In [99]:
import networkx as nx
def _get_mp_graph(datafile="parsed_mp.txt"):
    mp_data = pd.read_csv(datafile, sep="\t", header=None)
    mp_data.head()
    mp_graph = nx.from_pandas_edgelist(mp_data, 0,1, create_using=nx.DiGraph)
    return mp_graph

mp_graph = _get_mp_graph()

In [115]:
nodes = nx.dfs_preorder_nodes(mp_graph, "MP:0002069")
nodes = [n for n in nodes]


In [114]:
orig_genes = []
new_genes = []
for node in nodes:
    print(node)
    orig_genes = set([MPO_orig.genes[n] for n in MPO_orig.term_2_gene[node]])
    new_genes = set([MPO_new.genes[n] for n in MPO_new.term_2_gene[node]])
    print(orig_genes.intersection(new_genes))
    print(orig_genes.union(new_genes))
    print("Original ", orig_genes.difference(new_genes))
    print("new ", new_genes.difference(orig_genes))
    print("\n")

MP:0002069
set()
set()
Original  set()
new  set()


MP:0001422
{'NEUROD4', 'SCT', 'UCN2', 'PTGS1', 'OXT', 'SCTR', 'TRPV4', 'PTGS2'}
{'NEUROD4', 'SCT', 'UCN2', 'PTGS1', 'OXT', 'SCTR', 'TRPV4', 'PTGS2'}
Original  set()
new  set()


MP:0001423
{'ITPR3', 'NOS1', 'POU2F3', 'SLC18A2', 'DISC1', 'BDNF', 'EHMT1', 'ACADS', 'ADORA2A'}
{'NOS1', 'EHMT2', 'ACADS', 'ADORA2A', 'ITPR3', 'POU2F3', 'SLC18A2', 'GRIN1', 'DISC1', 'BDNF', 'EHMT1'}
Original  set()
new  {'EHMT2', 'GRIN1'}


MP:0001987
{'SLC29A1', 'FAAH', 'CCL2', 'SLC6A1', 'EPS8', 'UNC79', 'CRHR1', 'TRPV1', 'GPR26', 'GABRB1', 'PRKCG'}
{'SLC29A1', 'FAAH', 'CCR2', 'SLC6A1', 'LINC02210-CRHR1', 'EPS8', 'TRPV1', 'GPR26', 'CCL2', 'CCL8', 'UNC79', 'CRHR1', 'GABRB1', 'PRKCG'}
Original  set()
new  {'CCR2', 'LINC02210-CRHR1', 'CCL8'}


MP:0002119
set()
set()
Original  set()
new  set()


MP:0002570
{'CCL2', 'CCR2', 'CACNA1B', 'DRD2', 'NPY2R', 'CCL3', 'MPDZ'}
{'CCL3L1', 'CCR2', 'CCL3L3', 'MPDZ', 'CCL18', 'CCL2', 'CCL8', 'DRD2', 'NPY2R', 'CCL3', 'CACNA1B'}


In [110]:
new_genes = []
for node in nodes:
    new_genes += MPO_new.term_2_gene[node]
len(set(new_genes))

776

In [86]:
mps = mgi_orig.MP.unique()
for _ in range(10):
    idx = rn.randint(0, len(mps))
    test = mps[idx]
    print(test)
    print("Original", mgi_orig.loc[mgi_orig.MP==test, "human_ortholog"].unique())
    print("New", mgi_new.loc[mgi_new.MP==test, "human_ortholog"].unique())

MP:0004813
Original ['POU3F4' 'SLC30A4' 'NOX3' 'CYBA' 'AP3D1' 'ATP2B2' 'CDH23' 'SOBP' 'PCDH15'
 'SPTBN4' 'USH1G' 'KCNE1' 'ESPN' 'MYO7A' 'TMIE' 'MYO6' 'HR' 'MBP' 'OTOP1'
 'KCNA10' nan]
New ['POU3F4' 'SLC30A4' 'NOX3' 'CYBA' 'AP3D1' 'ATP2B2' 'CDH23' 'SOBP' 'PCDH15'
 'SPTBN4' 'USH1G' 'KCNE1' 'ESPN' 'MYO7A' 'TMIE' 'MYO6' 'HR' 'MBP' 'OTOP1'
 'KCNA10' 'CAPZB']
MP:0005175
Original ['KIT' 'MITF' 'MCOLN3' 'PFAS' nan 'DOCK7' 'AEBP2' 'SOX10' 'RACK1']
New ['KIT' 'MITF' 'MCOLN3' 'PFAS' 'DOCK7' 'KITLG' 'AEBP2' 'SOX10' 'RACK1']
MP:0003905
Original ['FBN1']
New ['FBN1']
MP:0002819
Original ['DSPP' 'DMP1' 'COL1A1' 'MAP3K11' 'FAM20C' nan]
New ['DSPP' 'DMP1' 'COL1A1' 'MAP3K11' 'FAM20C' 'SSUH2']
MP:0013820
Original ['CHD7' 'FOXG1' 'GAS2L2']
New ['CHD7' 'FOXG1' 'GAS2L2']
MP:0005659
Original ['CREBBP' 'PTPN1' 'UCP1' 'HMGA2' 'TRPV1' 'CAV1' 'LIPE' 'ESRRA' 'PRKAR2B'
 'KCNA3' 'FABP2' 'CNR1' 'PPARGC1A' 'SDC3' 'MCHR1' 'INPPL1' nan 'NPC1L1'
 'FOXS1' 'GHSR' 'MAPK8' 'KCNJ11' 'TNF' 'DGAT1' 'CACNA1B' 'RANBP2' 'CBL'
 'A

In [89]:
len(mgi_orig)

352807

## Other

In [20]:
alleles = pd.read_csv("MGI_PhenotypicAllele.rpt", sep="\t", comment="#", header=None, names=["MGI Allele Accession ID", "Allele Symbol",
                                                                                             "Allele Name", "Allele Type","Allele Attribute","PubMed ID for original reference",
                                                                                             "MGI Marker Accession ID","Marker Symbol","Marker RefSeq ID",
                                                                                             "Marker Ensembl ID","High-level Mammalian Phenotype ID (comma-delimited)",
                                                                                             "Synonyms (|-delimited)","Marker Name"])

In [24]:
alleles.loc[5000:5002]

Unnamed: 0,MGI Allele Accession ID,Allele Symbol,Allele Name,Allele Type,Allele Attribute,PubMed ID for original reference,MGI Marker Accession ID,Marker Symbol,Marker RefSeq ID,Marker Ensembl ID,High-level Mammalian Phenotype ID (comma-delimited),Synonyms (|-delimited),Marker Name
5000,MGI:5554915,Bbs4<tm1b(EUCOMM)Hmgu>,"targeted mutation 1b, Helmholtz Zentrum Muench...",Targeted,Reporter|Null/knockout,,MGI:2143311,Bbs4,XM_036154544,ENSMUSG00000025235,"MP:0005386,MP:0010768",,Bardet-Biedl syndrome 4 (human)
5001,MGI:6509074,Bbs4<tm1c(EUCOMM)Hmgu>,"targeted mutation 1c, Helmholtz Zentrum Muench...",Targeted,Conditional ready,33426789.0,MGI:2143311,Bbs4,XM_036154544,ENSMUSG00000025235,,Bbs4<FL>,Bardet-Biedl syndrome 4 (human)
5002,MGI:6509075,Bbs4<tm1d(EUCOMM)Hmgu>,"targeted mutation 1d, Helmholtz Zentrum Muench...",Targeted,Null/knockout,33426789.0,MGI:2143311,Bbs4,XM_036154544,ENSMUSG00000025235,,Bbss4<KO>,Bardet-Biedl syndrome 4 (human)


In [28]:
"Bbs4<tm1b(EUCOMM)Hmgu>" in mgi_orig["MGI_Allele_Accession_ID"]

False

In [26]:
mgi_new[mgi_new.gene_name=="Bbs4"]

Unnamed: 0,MGI_Allele_Accession_ID,Allele symbol,involves,MP,PMID,MGI_marker_accession_ID,MGI,MGI Accession ID,Marker Symbol,human_ortholog,gene_name
47604,Bbs4<tm1Vcs>/Bbs4<tm1Vcs>,Bbs4<tm1Vcs>,involves: 129S1/Sv * 129X1/SvJ * C57BL/6J,MP:0001433,15173597,MGI:2143311,MGI:2143311,MGI:2143311,Bbs4,BBS4,Bbs4
47605,Bbs4<tm1Vcs>/Bbs4<tm1Vcs>,Bbs4<tm1Vcs>,involves: 129S1/Sv * 129X1/SvJ * C57BL/6J,MP:0001262,15173597,MGI:2143311,MGI:2143311,MGI:2143311,Bbs4,BBS4,Bbs4
47606,Bbs4<tm1Vcs>/Bbs4<tm1Vcs>,Bbs4<tm1Vcs>,involves: 129S1/Sv * 129X1/SvJ * C57BL/6J,MP:0001261,15173597,MGI:2143311,MGI:2143311,MGI:2143311,Bbs4,BBS4,Bbs4
47607,Bbs4<tm1Vcs>/Bbs4<tm1Vcs>,Bbs4<tm1Vcs>,involves: 129S1/Sv * 129X1/SvJ * C57BL/6J,MP:0001925,15173597,MGI:2143311,MGI:2143311,MGI:2143311,Bbs4,BBS4,Bbs4
47608,Bbs4<tm1Vcs>/Bbs4<tm1Vcs>,Bbs4<tm1Vcs>,involves: 129S1/Sv * 129X1/SvJ * C57BL/6J,MP:0001326,15173597,MGI:2143311,MGI:2143311,MGI:2143311,Bbs4,BBS4,Bbs4
...,...,...,...,...,...,...,...,...,...,...,...
337470,Bbs4<Gt1Nk>/Bbs4<Gt1Nk>,Bbs4<Gt1Nk>,B6.129S7-Bbs4<Gt1Nk>,MP:0009454,31479441,MGI:2143311,MGI:2143311,MGI:2143311,Bbs4,BBS4,Bbs4
337471,Bbs4<Gt1Nk>/Bbs4<Gt1Nk>,Bbs4<Gt1Nk>,B6.129S7-Bbs4<Gt1Nk>,MP:0009456,31479441,MGI:2143311,MGI:2143311,MGI:2143311,Bbs4,BBS4,Bbs4
337472,Bbs4<Gt1Nk>/Bbs4<Gt1Nk>,Bbs4<Gt1Nk>,B6.129S7-Bbs4<Gt1Nk>,MP:0020508,31479441,MGI:2143311,MGI:2143311,MGI:2143311,Bbs4,BBS4,Bbs4
337473,Bbs4<Gt1Nk>/Bbs4<Gt1Nk>,Bbs4<Gt1Nk>,B6.129S7-Bbs4<Gt1Nk>,MP:0020511,31479441,MGI:2143311,MGI:2143311,MGI:2143311,Bbs4,BBS4,Bbs4


In [29]:
zz = pd.read_csv('MGI_GenePheno.rpt', sep="\t", names=["Allelic Composition","Allele Symbol(s)","Allele ID(s)",
                                                       "Genetic Background","Mammalian Phenotype ID","PubMed ID (pipe-delimited)","MGI Marker Accession ID (pipe-delimited)","MGI Genotype Accession ID (pipe-delimited)"])

In [30]:
zz.head()

Unnamed: 0,Allelic Composition,Allele Symbol(s),Allele ID(s),Genetic Background,Mammalian Phenotype ID,PubMed ID (pipe-delimited),MGI Marker Accession ID (pipe-delimited),MGI Genotype Accession ID (pipe-delimited)
0,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0000600,12529408,MGI:97874,MGI:2166359
1,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001716,16449662,MGI:97874,MGI:2166359
2,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001698,16449662,MGI:97874,MGI:2166359
3,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0001092,16449662,MGI:97874,MGI:2166359
4,Rb1<tm1Tyj>/Rb1<tm1Tyj>,Rb1<tm1Tyj>,MGI:1857242,involves: 129S2/SvPas,MP:0000961,16449662,MGI:97874,MGI:2166359


In [31]:
zz[zz["Allele ID(s)"] == "MGI:5554915"]

Unnamed: 0,Allelic Composition,Allele Symbol(s),Allele ID(s),Genetic Background,Mammalian Phenotype ID,PubMed ID (pipe-delimited),MGI Marker Accession ID (pipe-delimited),MGI Genotype Accession ID (pipe-delimited)
195340,Bbs4<tm1b(EUCOMM)Hmgu>/Bbs4<+>,Bbs4<+>|Bbs4<tm1b(EUCOMM)Hmgu>,MGI:5554915,B6N(Cg)-Bbs4<tm1b(EUCOMM)Hmgu>/J,MP:0001417,,MGI:2143311,MGI:5763838
195341,Bbs4<tm1b(EUCOMM)Hmgu>/Bbs4<+>,Bbs4<+>|Bbs4<tm1b(EUCOMM)Hmgu>,MGI:5554915,B6N(Cg)-Bbs4<tm1b(EUCOMM)Hmgu>/J,MP:0020870,,MGI:2143311,MGI:5763838
195342,Bbs4<tm1b(EUCOMM)Hmgu>/Bbs4<tm1b(EUCOMM)Hmgu>,Bbs4<tm1b(EUCOMM)Hmgu>,MGI:5554915,B6N(Cg)-Bbs4<tm1b(EUCOMM)Hmgu>/J,MP:0011110,,MGI:2143311,MGI:5763839


In [32]:
zz[zz["MGI Marker Accession ID (pipe-delimited)"]=="MGI:2143311"]

Unnamed: 0,Allelic Composition,Allele Symbol(s),Allele ID(s),Genetic Background,Mammalian Phenotype ID,PubMed ID (pipe-delimited),MGI Marker Accession ID (pipe-delimited),MGI Genotype Accession ID (pipe-delimited)
40066,Bbs4<tm1Vcs>/Bbs4<tm1Vcs>,Bbs4<tm1Vcs>,MGI:3045473,involves: 129S1/Sv * 129X1/SvJ * C57BL/6J,MP:0001433,15173597,MGI:2143311,MGI:3045547
40067,Bbs4<tm1Vcs>/Bbs4<tm1Vcs>,Bbs4<tm1Vcs>,MGI:3045473,involves: 129S1/Sv * 129X1/SvJ * C57BL/6J,MP:0001262,15173597,MGI:2143311,MGI:3045547
40068,Bbs4<tm1Vcs>/Bbs4<tm1Vcs>,Bbs4<tm1Vcs>,MGI:3045473,involves: 129S1/Sv * 129X1/SvJ * C57BL/6J,MP:0001261,15173597,MGI:2143311,MGI:3045547
40069,Bbs4<tm1Vcs>/Bbs4<tm1Vcs>,Bbs4<tm1Vcs>,MGI:3045473,involves: 129S1/Sv * 129X1/SvJ * C57BL/6J,MP:0001925,15173597,MGI:2143311,MGI:3045547
40070,Bbs4<tm1Vcs>/Bbs4<tm1Vcs>,Bbs4<tm1Vcs>,MGI:3045473,involves: 129S1/Sv * 129X1/SvJ * C57BL/6J,MP:0001326,15173597,MGI:2143311,MGI:3045547
...,...,...,...,...,...,...,...,...
229917,Bbs4<Gt1Nk>/Bbs4<Gt1Nk>,Bbs4<Gt1Nk>,MGI:3055492,B6.129S7-Bbs4<Gt1Nk>,MP:0009454,31479441,MGI:2143311,MGI:6370045
229918,Bbs4<Gt1Nk>/Bbs4<Gt1Nk>,Bbs4<Gt1Nk>,MGI:3055492,B6.129S7-Bbs4<Gt1Nk>,MP:0009456,31479441,MGI:2143311,MGI:6370045
229919,Bbs4<Gt1Nk>/Bbs4<Gt1Nk>,Bbs4<Gt1Nk>,MGI:3055492,B6.129S7-Bbs4<Gt1Nk>,MP:0020508,31479441,MGI:2143311,MGI:6370045
229920,Bbs4<Gt1Nk>/Bbs4<Gt1Nk>,Bbs4<Gt1Nk>,MGI:3055492,B6.129S7-Bbs4<Gt1Nk>,MP:0020511,31479441,MGI:2143311,MGI:6370045


In [39]:
new_map = pd.read_csv('MGI_GenePheno.rpt', sep="\t", usecols=[2, 4, 6],
                  names=["Allele_ID","MP","MGI"])
new_map.head()

Unnamed: 0,Allele_ID,MP,MGI
0,MGI:1857242,MP:0000600,MGI:97874
1,MGI:1857242,MP:0001716,MGI:97874
2,MGI:1857242,MP:0001698,MGI:97874
3,MGI:1857242,MP:0001092,MGI:97874
4,MGI:1857242,MP:0000961,MGI:97874


In [40]:
new_map.loc["MGI"] = new_map.MGI.apply(lambda x: x.split("|"))
new_map = new_map.explode("MGI")

Unnamed: 0,Allele_ID,MP,MGI
0,MGI:1857242,MP:0000600,MGI:97874
1,MGI:1857242,MP:0001716,MGI:97874
2,MGI:1857242,MP:0001698,MGI:97874
3,MGI:1857242,MP:0001092,MGI:97874
4,MGI:1857242,MP:0000961,MGI:97874


In [61]:
xx = pd.read_csv("HMD_HumanPhenotype.rpt", sep="\t", usecols=[0,2,3], names=["symbol", "gene_name", "MGI"])
xx.head()


Unnamed: 0,symbol,gene_name,MGI
0,A1BG,A1bg,MGI:2152878
1,A1CF,A1cf,MGI:1917115
2,A2M,A2m,MGI:2449119
3,A2ML1,Mug1,MGI:99837
4,A2ML1,Mug2,MGI:99836


In [62]:
new_map.head()

Unnamed: 0,Allele_ID,MP,MGI
0,MGI:1857242,MP:0000600,MGI:97874
1,MGI:1857242,MP:0001716,MGI:97874
2,MGI:1857242,MP:0001698,MGI:97874
3,MGI:1857242,MP:0001092,MGI:97874
4,MGI:1857242,MP:0000961,MGI:97874


In [65]:
mapping = new_map.merge(xx, on="MGI", how="outer")

In [72]:
mapping

Unnamed: 0,Allele_ID,MP,MGI,symbol,gene_name
0,MGI:1857242,MP:0000600,MGI:97874,RB1,Rb1
1,MGI:1857242,MP:0001716,MGI:97874,RB1,Rb1
2,MGI:1857242,MP:0001698,MGI:97874,RB1,Rb1
3,MGI:1857242,MP:0001092,MGI:97874,RB1,Rb1
4,MGI:1857242,MP:0000961,MGI:97874,RB1,Rb1
...,...,...,...,...,...
270189,,,MGI:3694898,ZXDA,Zxdb
270190,,,MGI:3694898,ZXDB,Zxdb
270191,,,MGI:1933108,ZXDC,Zxdc
270192,,,MGI:2446208,ZYG11A,Zyg11a


In [74]:
mps = mgi_orig.MP.unique()
for _ in range(10):
    idx = rn.randint(0, len(mps))
    test = mps[idx]
    print(test)
    print("Original", mgi_orig.loc[mgi_orig.MP==test, "human_ortholog"].unique())
    print("New", mapping.loc[mapping.MP==test, "symbol"].unique())

MP:0000073
Original ['CTNNB1' 'PTPN11']
New []
MP:0003903
Original ['GABPA' 'SEPTIN7']
New []
MP:0010015
Original ['POU4F3' 'CAPZB' 'MYO6']
New ['MYO6' 'POU4F3']
MP:0008070
Original ['FOXN1' nan 'CBFB' 'IKZF1' 'ITPKB' 'ZAP70' 'XRCC5' 'TCF12' 'CD3E' 'KLF2'
 'RAG2' 'PRKDC' 'PRF1' 'IL2RG' 'BCL11B' 'LAT' 'PTPRC' 'FYN' 'RAG1' 'ZBTB1'
 'DCLRE1C' 'COL7A1']
New ['FOXN1' 'LAT' 'RAG1' nan 'IKZF1' 'CD3E' 'PRKDC' 'BCL11B' 'DCLRE1C'
 'ITPKB' 'PTPRC' 'COL7A1' 'TCF12' 'ZAP70' 'XRCC5']
MP:0014168
Original ['DOCK7']
New ['DOCK7']
MP:0009325
Original [nan 'BRWD1' 'CSF1' 'PFDN5' 'BEST1' 'VPS13B' 'KCNG4']
New ['CD59' 'QKI' 'BRWD1' 'EPHB6' 'CSF1' 'BEST1' 'PFDN5' 'TRPV6' 'KCNG4'
 'VPS13B']
MP:0008365
Original ['NPC1' 'SLC6A3' 'EGR1' 'PROP1' 'PITX2' 'POU1F1' 'LIN9' 'CDK4' 'LHX3']
New ['PITX2' 'CDK4' 'NPC1' 'SLC6A3' 'EGR1' 'PROP1' 'LHX3' 'POU1F1']
MP:0010919
Original ['HES1' 'FBN1' 'NEUROD1']
New ['HES1' 'NEUROD1' 'FBN1']
MP:0030016
Original ['PIK3R1' 'PRKCA' 'NPY1R' nan 'TLR2' 'SLC2A4' 'FABP5' 'LEP' 'PPARG'


In [75]:
xx[xx.MGI=="MGI:88276"]

Unnamed: 0,symbol,gene_name,MGI
4181,CTNNB1,Ctnnb1,MGI:88276


In [81]:
new_map[new_map.MGI=="MGI:95610"].sort_values(by="MP").MP.values

array(['MP:0000438', 'MP:0000706', 'MP:0000715', 'MP:0001293',
       'MP:0002102', 'MP:0003743', 'MP:0004200', 'MP:0011091',
       'MP:0011092', 'MP:0011099', 'MP:0011100'], dtype=object)

In [84]:
mgi_new[mgi_new["MP"]=="MP:0003903"]

Unnamed: 0,MGI_Allele_Accession_ID,Allele symbol,involves,MP,PMID,MGI_marker_accession_ID,MGI,MGI Accession ID,Marker Symbol,human_ortholog,gene_name
124231,Gabpa<tm1Agro>/Gabpa<tm1Agro>,Gabpa<tm1Agro>,involves: 129,MP:0003903,17277770,MGI:95610,MGI:95610,MGI:95610,Gabpa,GABPA,Gabpa
275030,Septin7<tm1Mgl>/Septin7<tm1Mgl>,Septin7<tm1Mgl>,involves: 129P2/OlaHsd * C3H * C57BL/6,MP:0003903,25122120,MGI:1335094,MGI:1335094,MGI:1335094,Septin7,SEPTIN7,Septin7
