# Dig it...

In [26]:
# code
import pandas
import pymysql
import requests

# Open the MySQL connection used by every lookup helper in this notebook.
# Keyword arguments are required: PyMySQL 1.0+ made connect() keyword-only,
# so the former positional form raises TypeError on current releases.
db = pymysql.connect(host="localhost", user="mygene", password="", database="ensembl2entrez")
# Single module-level cursor shared by all the helper functions below.
cursor = db.cursor()

def is_ncbi_id(value):
    """Return True when ``value`` is found as an NCBI ID in the local database.

    The value is searched in the ``ncbi`` column of the ``entrez`` table and,
    failing that, in the ``ncbi`` column of the ``ens2ent`` mapping table.
    Uses the module-level ``cursor``.
    """
    # search in list of ID or mapping from the other side
    query = """
    select ncbi
    from entrez
    where ncbi = %s
    union
    select ncbi from ens2ent where ens2ent.ncbi = %s
    limit 1
    """
    cursor.execute(query, (value, value))
    row = cursor.fetchone()
    # fetchone() gives None when nothing matched.
    return bool(row)

def is_ensembl_id(value):
    """Return True when ``value`` is found as an Ensembl ID in the local database.

    The value is searched in the ``ensembl`` column of the ``ensembl`` table
    and, failing that, in the ``ensembl`` column of the ``ent2ens`` mapping
    table. Uses the module-level ``cursor``.
    """
    query = """
    select ensembl
    from ensembl
    where ensembl = %s
    union
    select ensembl from ent2ens where ent2ens.ensembl = %s
    limit 1
    """
    cursor.execute(query, (value, value))
    row = cursor.fetchone()
    # fetchone() gives None when nothing matched.
    return bool(row)

def get_outputtype_from_inputtype_according_to_provider(inputtype, outputtype, provider, value):
    """Look up the IDs mapped to ``value`` in one provider's mapping table.

    Args:
        inputtype: column ``value`` is matched against; one of
            "ensembl", "ncbi" or "symbol".
        outputtype: column whose values are returned; same choices.
        provider: "ensembl" (table ``ens2ent``) or "entrez" (table ``ent2ens``).
        value: the ID to look up (bound as a query parameter).

    Returns:
        A list with the matching ``outputtype`` values converted to ``str``;
        empty when nothing matches.

    Raises:
        ValueError: if ``inputtype`` or ``outputtype`` is not an allowed column.
        KeyError: if ``provider`` is unknown.
    """
    providers = {"ensembl" : "ens2ent", "entrez" : "ent2ens"}
    allowed_types = ("ensembl","ncbi","symbol")
    # Column and table names cannot be bound as SQL parameters, so they are
    # interpolated into the statement text. They must therefore be whitelisted
    # with a real check: an `assert` disappears under `python -O`, which would
    # leave the interpolation unguarded.
    for column in (inputtype, outputtype):
        if column not in allowed_types:
            raise ValueError("unsupported ID type: %r" % (column,))
    sql = """
    select {outputtype}
    from {provider}
    where {provider}.{inputtype} = %s
    """.format(**{
        "outputtype" : outputtype,
        "inputtype" : inputtype,
        "provider" : providers[provider]
    })
    cursor.execute(sql,(value,))
    # Each row is a 1-tuple; normalise every ID to str so results from the
    # different providers can be compared with each other.
    return [str(row[0]) for row in cursor.fetchall()]
    

def query_mygene(value, inputfield, outputfield, url="http://mygene.info/v3", session=None, timeout=30):
    """Query the MyGene.info ``/query`` endpoint and collect one field from the hits.

    Args:
        value: the ID to search for.
        inputfield: MyGene field the ID is matched against (e.g. "entrezgene").
        outputfield: field to collect from each hit; only "entrezgene" and
            "ensembl.gene" are requested from the service.
        url: base URL of the MyGene API.
        session: optional object exposing a ``requests``-compatible ``get``
            (e.g. a ``requests.Session``); defaults to the ``requests`` module.
        timeout: seconds to wait for the service before giving up.

    Returns:
        A flat list of the ``outputfield`` values found in the hits, in hit
        order; empty when there is no hit or no hit carries the field.

    Raises:
        Whatever ``get`` / ``raise_for_status`` raise on network or HTTP errors.
    """
    http = session if session is not None else requests
    # Let the HTTP library build and encode the query string instead of
    # interpolating raw values into the URL.
    response = http.get(
        url + "/query",
        params={
            "q": "%s:%s" % (inputfield, value),
            "fields": "entrezgene,ensembl.gene",
            "dotfield": 1,
        },
        timeout=timeout,
    )
    # Fail loudly on HTTP errors rather than on a confusing JSON/KeyError later.
    response.raise_for_status()
    payload = response.json()
    collected = []
    # NOTE(review): only the hits present in this single response are read;
    # confirm the service's page size is large enough for the IDs of interest.
    for hit in payload.get("hits") or []:
        found = hit.get(outputfield)
        if not found:
            continue
        # The field holds either a single value or a list of values.
        if isinstance(found, list):
            collected.extend(found)
        else:
            collected.append(found)
    return collected
        
def get_ensembl_from_ncbi_according_to_ncbi(value):
    """Ensembl IDs mapped to NCBI ID ``value`` by the "entrez" provider."""
    return get_outputtype_from_inputtype_according_to_provider(
        inputtype="ncbi", outputtype="ensembl", provider="entrez", value=value
    )
def get_ensembl_from_ncbi_according_to_ensembl(value):
    """Ensembl IDs mapped to NCBI ID ``value`` by the "ensembl" provider."""
    return get_outputtype_from_inputtype_according_to_provider(
        inputtype="ncbi", outputtype="ensembl", provider="ensembl", value=value
    )
def get_ncbi_from_ensembl_according_to_ncbi(value):
    """NCBI IDs mapped to Ensembl ID ``value`` by the "entrez" provider."""
    return get_outputtype_from_inputtype_according_to_provider(
        inputtype="ensembl", outputtype="ncbi", provider="entrez", value=value
    )
def get_ncbi_from_ensembl_according_to_ensembl(value):
    """NCBI IDs mapped to Ensembl ID ``value`` by the "ensembl" provider."""
    return get_outputtype_from_inputtype_according_to_provider(
        inputtype="ensembl", outputtype="ncbi", provider="ensembl", value=value
    )
def get_ncbi_from_ensembl_according_to_mygene(value):
    """NCBI IDs ("entrezgene") that MyGene reports for Ensembl ID ``value``."""
    return query_mygene(value, inputfield="ensembl.gene", outputfield="entrezgene")
def get_ensembl_from_ncbi_according_to_mygene(value):
    """Ensembl IDs ("ensembl.gene") that MyGene reports for NCBI ID ``value``."""
    return query_mygene(value, inputfield="entrezgene", outputfield="ensembl.gene")

# test...
# Sanity check on one known pair: both providers must give the same answer in
# both directions (NCBI 1017 <-> ENSG00000123374).
enn = get_ensembl_from_ncbi_according_to_ncbi("1017")
ene = get_ensembl_from_ncbi_according_to_ensembl("1017")
# Compare the two providers with each other (the check used to compare `enn`
# with itself, so the Ensembl-provider result was never verified).
assert enn == ene == ["ENSG00000123374"], "enn: %s,ene: %s" % (repr(enn),repr(ene))
nen = get_ncbi_from_ensembl_according_to_ncbi("ENSG00000123374")
nee = get_ncbi_from_ensembl_according_to_ensembl("ENSG00000123374")
assert nen == nee == ["1017"], "nen: %s,nee: %s" % (repr(nen),repr(nee))

def digit(value):
    """Report how NCBI, Ensembl and MyGene map a gene ID to the other ID type.

    The ID type is detected with ``is_ncbi_id`` / ``is_ensembl_id``; the IDs
    mapped to it by each of the three sources are printed, followed by whether
    NCBI and Ensembl agree.

    Args:
        value: an NCBI or Ensembl gene ID.

    Returns:
        A ``pandas.DataFrame`` with one row per mapped ID (sorted) and the
        columns "ncbi", "ensembl", "mygene", each cell holding "X" when that
        source reports the mapping and "-" otherwise. ``None`` when ``value``
        is found in neither mapping.
    """
    print("Value to analyze: %s" % value)
    print("Searching mappings")
    isncbi = is_ncbi_id(value)
    isensembl = is_ensembl_id(value)
    if not isncbi and not isensembl:
        print("This ID can't be found in mappings")
        return
    assert isncbi != isensembl, "Can't be both a NCBI *and* Ensembl ID, weird..."
    print("It's a %s ID" % ("NCBI" if isncbi else "Ensembl"))
    print("\n")

    # Pick the three lookups for the detected direction; order matches `sources`.
    if isncbi:
        print("Find Ensembl genes mapped to this NCBI ID")
        lookups = (
            get_ensembl_from_ncbi_according_to_ncbi,
            get_ensembl_from_ncbi_according_to_ensembl,
            get_ensembl_from_ncbi_according_to_mygene,
        )
    else:
        print("Find NCBI genes mapped to this Ensembl ID")
        lookups = (
            get_ncbi_from_ensembl_according_to_ncbi,
            get_ncbi_from_ensembl_according_to_ensembl,
            get_ncbi_from_ensembl_according_to_mygene,
        )
    sources = ("NCBI", "Ensembl", "MyGene")

    mapped = []
    for source, lookup in zip(sources, lookups):
        print("According to %s, it's mapped to:" % source)
        ids = sorted(lookup(value))
        print(ids)
        mapped.append(ids)
    ncbi_list, ensembl_list, mygene_list = mapped

    if ncbi_list != ensembl_list:
        print("NCBI and Ensembl disagree")
    else:
        print("NCBI and Ensembl agree!")

    print("\n")
    # some table report
    memberships = [set(ids) for ids in mapped]
    # Sort the union so the row order is deterministic; iterating a bare set
    # gives an arbitrary order. key=str keeps the sort safe if a source ever
    # returns non-string IDs.
    all_ids = sorted(set().union(*memberships), key=str)
    table = {
        i: ["X" if i in members else "-" for members in memberships]
        for i in all_ids
    }

    I = pandas.Index(["ncbi","ensembl","mygene"], name="%s mapped to" % value)
    # Built with sources as rows, then transposed so each mapped ID is a row.
    return pandas.DataFrame(data=table,index=I).T
from ipywidgets import widgets
from IPython.display import display, clear_output
# Free-text box in which an NCBI or Ensembl ID can be typed.
input_symbol = widgets.Text()
def handle_submit(sender):
    """Run ``digit`` on the text currently in ``input_symbol`` and show the report.

    ``sender`` is the argument passed by the widget callback; it is not used.
    """
    res = digit(input_symbol.value)
    # digit() returns None (after printing its own message) when the ID is
    # unknown; avoid emitting a stray "None" in that case.
    if res is not None:
        display(res)
    print("---")
# NOTE(review): `on_submit` is flagged as deprecated in recent ipywidgets
# releases — confirm against the installed version before upgrading.
input_symbol.on_submit(handle_submit)

Examples


In [27]:
# A single NCBI ID mapped to many Ensembl genes.
digit("110284891") # among them MGP_CAROLIEiJ_G0022186

Value to analyze: 110284891
Searching mappings
It's a NCBI ID


Find Ensembl genes mapped to this NCBI ID
According to NCBI, it's mapped to:
['MGP_CAROLIEiJ_G0022141', 'MGP_CAROLIEiJ_G0022142', 'MGP_CAROLIEiJ_G0022143', 'MGP_CAROLIEiJ_G0022144', 'MGP_CAROLIEiJ_G0022145', 'MGP_CAROLIEiJ_G0022146', 'MGP_CAROLIEiJ_G0022150', 'MGP_CAROLIEiJ_G0022151', 'MGP_CAROLIEiJ_G0022152', 'MGP_CAROLIEiJ_G0022153', 'MGP_CAROLIEiJ_G0022154', 'MGP_CAROLIEiJ_G0022155', 'MGP_CAROLIEiJ_G0022156', 'MGP_CAROLIEiJ_G0022181', 'MGP_CAROLIEiJ_G0022182', 'MGP_CAROLIEiJ_G0022183', 'MGP_CAROLIEiJ_G0022186', 'MGP_CAROLIEiJ_G0022187', 'MGP_CAROLIEiJ_G0022188', 'MGP_CAROLIEiJ_G0022189', 'MGP_CAROLIEiJ_G0022190', 'MGP_CAROLIEiJ_G0022192', 'MGP_CAROLIEiJ_G0022193', 'MGP_CAROLIEiJ_G0022195', 'MGP_CAROLIEiJ_G0022196', 'MGP_CAROLIEiJ_G0022198', 'MGP_CAROLIEiJ_G0022199', 'MGP_CAROLIEiJ_G0022200', 'MGP_CAROLIEiJ_G0022201']
According to Ensembl, it's mapped to:
['MGP_CAROLIEiJ_G0022141', 'MGP_CAROLIEiJ_G0022142', 'MGP_CAROLIEi

110284891 mapped to,ncbi,ensembl,mygene
MGP_CAROLIEiJ_G0022141,X,X,X
MGP_CAROLIEiJ_G0022142,X,X,X
MGP_CAROLIEiJ_G0022143,X,X,X
MGP_CAROLIEiJ_G0022144,X,X,X
MGP_CAROLIEiJ_G0022145,X,X,X
MGP_CAROLIEiJ_G0022146,X,X,X
MGP_CAROLIEiJ_G0022150,X,X,X
MGP_CAROLIEiJ_G0022151,X,X,X
MGP_CAROLIEiJ_G0022152,X,X,X
MGP_CAROLIEiJ_G0022153,X,X,X


In [28]:
# Ensembl reports one more NCBI ID than NCBI does for this gene.
digit("ENSPTRG00000017346") # the extra one is 100034681

Value to analyze: ENSPTRG00000017346
Searching mappings
It's a Ensembl ID


Find NCBI genes mapped to this Ensembl ID
According to NCBI, it's mapped to:
['100034689', '100034690', '100034692', '100034693', '100034694', '100034695', '100034696', '100034697', '471673']
According to Ensembl, it's mapped to:
['100034681', '100034689', '100034690', '100034692', '100034693', '100034694', '100034695', '100034696', '100034697', '471673']
According to MyGene, it's mapped to:
['100034692']
NCBI and Ensembl disagree




ENSPTRG00000017346 mapped to,ncbi,ensembl,mygene
100034681,-,X,-
100034689,X,X,-
100034690,X,X,-
100034692,X,X,X
100034693,X,X,-
100034694,X,X,-
100034695,X,X,-
100034696,X,X,-
100034697,X,X,-
471673,X,X,-


In [29]:
# Reverse lookup of the NCBI ID that only Ensembl reported for ENSPTRG00000017346.
digit("100034681")

Value to analyze: 100034681
Searching mappings
It's a NCBI ID


Find Ensembl genes mapped to this NCBI ID
According to NCBI, it's mapped to:
['ENSPTRG00000048702']
According to Ensembl, it's mapped to:
['ENSPTRG00000017346']
According to MyGene, it's mapped to:
[]
NCBI and Ensembl disagree




100034681 mapped to,ncbi,ensembl,mygene
ENSPTRG00000017346,-,X,-
ENSPTRG00000048702,X,-,-


In [30]:
# NCBI and Ensembl each give a different Ensembl gene; MyGene matches Ensembl.
digit("103790045") # we don't trust Entrez for NCBI->Ensembl mappings

Value to analyze: 103790045
Searching mappings
It's a NCBI ID


Find Ensembl genes mapped to this NCBI ID
According to NCBI, it's mapped to:
['ENSCJAG00000002036']
According to Ensembl, it's mapped to:
['ENSCJAG00000002058']
According to MyGene, it's mapped to:
['ENSCJAG00000002058']
NCBI and Ensembl disagree




103790045 mapped to,ncbi,ensembl,mygene
ENSCJAG00000002036,X,-,-
ENSCJAG00000002058,-,X,X


In [31]:
digit("ENSAMEG00000011997") # Ambiguous mapping (symbol matched):
                            # we kept what Ensembl said and ignored what NCBI said. This Ensembl ID
                            # comes from Entrez. In MyGene it stands alone, as an Ensembl gene.

Value to analyze: ENSAMEG00000011997
Searching mappings
It's a Ensembl ID


Find NCBI genes mapped to this Ensembl ID
According to NCBI, it's mapped to:
['100482314']
According to Ensembl, it's mapped to:
[]
According to MyGene, it's mapped to:
[]
NCBI and Ensembl disagree




ENSAMEG00000011997 mapped to,ncbi,ensembl,mygene
100482314,X,-,-


In [32]:
# The Ensembl gene that NCBI (but neither Ensembl nor MyGene) associates with 100034681.
digit("ENSPTRG00000048702")

Value to analyze: ENSPTRG00000048702
Searching mappings
It's a Ensembl ID


Find NCBI genes mapped to this Ensembl ID
According to NCBI, it's mapped to:
['100034681']
According to Ensembl, it's mapped to:
[]
According to MyGene, it's mapped to:
[]
NCBI and Ensembl disagree




ENSPTRG00000048702 mapped to,ncbi,ensembl,mygene
100034681,X,-,-


Form

In [33]:
# Clear this cell's previous output, then render the text box whose submit
# callback is handle_submit.
clear_output(wait=True)
display(input_symbol)

Text(value='')