In [1]:
%load_ext autoreload
%autoreload 2

# Comparing NeuroLex and BBP ontologies


In [2]:
from similarity import _cleanup, _normalize,   similarity2

from sherlok import Sherlok
neuroner = Sherlok('neuroner')

In [3]:
# PARSE OBO
import oboparser, re

hbp_obo_file = 'hbp_neurolex/hbp_cell_ontology.obo'
nlex_obo_file = 'hbp_neurolex/neurolex.obo'

# a simple function to pull out the cell names
SYNONOYM_NAME = re.compile(r'"(.*?)"').search
def get_cell_names(obo_onto_file):
    cell_names = {}    
    for stanza in oboparser.parse(obo_onto_file):
        if stanza['@type'] != "Term": continue
        id = stanza["id"]
        if id == '': continue
        name = stanza["name"]
        synonyms = []
        for s in stanza["synonym"]:
            synonyms.append(SYNONOYM_NAME(s).group(1))
        cell_names[id] = (name, synonyms)
    return cell_names

hbp_cell_names  = get_cell_names(hbp_obo_file)
nlex_cell_names = get_cell_names(nlex_obo_file)
print 'hbp_cell_names', len(hbp_cell_names)
print 'nlex_cell_names', len(nlex_cell_names)
assert hbp_cell_names['HBP_CELL:0000016'][0] == 'neuron expressing Parvalbumin'
assert hbp_cell_names['HBP_CELL:0000016'][1][1] == 'Pvalb+ cell'

hbp_cell_names 133
nlex_cell_names 750


In [4]:
# PREPROCESS NEUROLEX and HBP
def preprocess(cell_names):
    cell_names_processed = {}
    for id, entity in cell_names.items():
        name, synonyms = entity
        print('name::', name, "id:", id)
        variants = []
        variants.append(_cleanup(neuroner.annotate(name).annotations)) # name
        for s in synonyms:
            print('         syn::',s)
            variants.append(_cleanup(neuroner.annotate(name).annotations)) # synonyms
        cell_names_processed[id] = variants
    return cell_names_processed

hbp_cell_names_processed  = preprocess(hbp_cell_names)
nlex_cell_names_processed = preprocess(nlex_cell_names)
print('done :-)')

('name::', 'Axonal Cell', 'id:', 'HBP_CELL:0000065')
('name::', 'Nest Basket Cell', 'id:', 'HBP_CELL:0000064')
('         syn::', 'arcade cell')
('         syn::', 'willow cell')
('         syn::', 'NBC')
('name::', 'Large Axon Cell', 'id:', 'HBP_CELL:0000067')
('         syn::', 'LAC')
('name::', 'Descending Axon Cell', 'id:', 'HBP_CELL:0000066')
('         syn::', 'DAC')
('name::', 'Cajal-Retzius cell like cell', 'id:', 'HBP_CELL:0000061')
('         syn::', 'CrC')
('name::', 'Chandelier Cell', 'id:', 'HBP_CELL:0000060')
('         syn::', 'Axo-axonic cell')
('         syn::', 'ChC')
('name::', 'Large Basket Cell', 'id:', 'HBP_CELL:0000063')
('         syn::', 'LBC')
('name::', 'Small Basket Cell', 'id:', 'HBP_CELL:0000062')
('         syn::', 'SBC')
('name::', 'Layer V Neurogliaform Cell', 'id:', 'HBP_CELL:0000115')
('         syn::', 'L5_NGC')
('name::', 'Layer IV Neurogliaform Cell', 'id:', 'HBP_CELL:0000114')
('         syn::', 'L4_NGC')
('name::', 'Layer II/III Martinotti Cell',

In [5]:
import pickle
pickle.dump(hbp_cell_names_processed,  open('hbp_cell_names_processed.pckl',  'wb'))
pickle.dump(nlex_cell_names_processed, open('nlex_cell_names_processed.pckl', 'wb'))

In [13]:
print hbp_cell_names_processed['HBP_CELL:0000064']

[[u'HBP_MORPHOLOGY:0000019'],
 [[u'HBP_MORPHOLOGY:0000019']],
 [[u'HBP_MORPHOLOGY:0000019']],
 [[u'HBP_MORPHOLOGY:0000019']]]

In [21]:
#print nlex_cell_names['nlx_cell_1006021']
#print nlex_cell_names_processed['nlx_cell_1006021']

('Spinal cord ventral horn interneuron V0C', ['V0c interneuron', 'spinal cord V0c interneuron'])
[[u'UNKN_REGION:4645', u'UNKN_REGION:240'], [[u'UNKN_REGION:4645', u'UNKN_REGION:240']], [[u'UNKN_REGION:4645', u'UNKN_REGION:240']]]


In [43]:
#%debug
hbp_hits={}
for hbp_id, hbp_cell_variants in hbp_cell_names_processed.items():
    hbp_variant = hbp_cell_variants[0] # just the 1st one TODO
    if len(hbp_variant) == 0:
        print('NO VARIANT FOR ', hbp_id, hbp_cell_names[hbp_id])
    else:
        hits = []
        for nlex_id, nlex_cell_variants in nlex_cell_names_processed.items():
            for variant in nlex_cell_variants:
                if len(variant) == 0: continue
                #print("sim: HBP::",hbp_variant, "NLEX::",variant, "nlex_id",nlex_id)
                sim = similarity2(hbp_variant, variant[0], use_inter_similarity=True)
                if sim[0] > 0:
                    hits.append((nlex_id,) + sim)
        hbp_hits[hbp_id] = hits
print('hits', len(hbp_hits))
#hit: (nlex_id, sim_score, explanations...)

('NO VARIANT FOR ', 'HBP_CELL:0000065', ('Axonal Cell', []))
('NO VARIANT FOR ', 'HBP_CELL:0000010', ('neuron classified based on gene expression', []))
('NO VARIANT FOR ', 'HBP_CELL:0000003', ('neuron classified base morphological properties', []))
('NO VARIANT FOR ', 'HBP_CELL:0000001', ('neuron', []))
('NO VARIANT FOR ', 'HBP_CELL:0000000', ('cell', []))
('NO VARIANT FOR ', 'HBP_CELL:0000007', ('neuron classified based on neurotransmitter release', []))
('NO VARIANT FOR ', 'HBP_CELL:0000006', ('neuron classified based on connection properties', []))
('NO VARIANT FOR ', 'HBP_CELL:0000005', ('neuron classified based on molecular expression', []))
('NO VARIANT FOR ', 'HBP_CELL:0000004', ('neuron classified base electrical properties', []))
('NO VARIANT FOR ', 'HBP_CELL:0000009', ('neuron classified based on protein expression', []))
('NO VARIANT FOR ', 'HBP_CELL:0000033', ('neuron classified based on location', []))
('hits', 122)


In [56]:
#hbp_hits['HBP_CELL:0000028']

In [55]:
def get_key(item):
    return item[0]

for hbp_id, hits in hbp_hits.items():
    if len(hits) > 0:
        hits_sorted = sorted(hits, key=get_key)
        
        print('{} ({})'.format(hbp_cell_names[hbp_id][0], hbp_id))
        already_printed = []
        for nlex_id, score, explain in hits_sorted[:5]:
            if nlex_id not in already_printed:
                already_printed.append(nlex_id)
                print '* {} ({}, {})'.format(nlex_cell_names[nlex_id][0], round(score, 2), nlex_id)
        print ''

Nest Basket Cell (HBP_CELL:0000064)
* Bed nucleus of the stria terminalis basket neuron (0.57, BAMSC988)
* Neocortex basket cell (0.57, nifext_56)

Cajal-Retzius cell like cell (HBP_CELL:0000061)
* Neocortex Cajal-Retzius cell (0.57, nlx_cell_20081206)

Chandelier Cell (HBP_CELL:0000060)
* Bed nucleus of the stria terminalis spiny neuron with chandelier-like axon (0.44, BAMSC992)
* Neocortex chandelier cell (0.57, nifext_57)

Large Basket Cell (HBP_CELL:0000063)
* Bed nucleus of the stria terminalis basket neuron (0.57, BAMSC988)
* Neocortex basket cell (0.57, nifext_56)

Small Basket Cell (HBP_CELL:0000062)
* Bed nucleus of the stria terminalis basket neuron (0.57, BAMSC988)
* Neocortex basket cell (0.57, nifext_56)

Layer V Neurogliaform Cell (HBP_CELL:0000115)
* Neocortex pyramidal cell layer 5-6 (0.47, nifext_50)

Layer IV Neurogliaform Cell (HBP_CELL:0000114)
* Neocortex stellate layer 4 cell (0.6, nifext_53)
* Neocortex Dorsolateral prefrontal area layer 4 Calbindin cell (0.54, n