In [1]:
%load_ext autoreload
%autoreload 2

# Comparing NeuroLex and BBP ontologies


In [2]:
import sys
sys.path.append('../')
from similarity import _cleanup, _normalize, similarity2

from sherlok import Sherlok
# NOTE(review): presumably a handle on the 'neuroner' Sherlok pipeline -- confirm.
# Used further down as neuroner.annotate(text).annotations.
neuroner = Sherlok('neuroner')

In [3]:
# PARSE OBO
import oboparser, re

# OBO files of the two ontologies that are compared in this notebook.
hbp_obo_file  = 'hbp_cell_ontology.obo'
nlex_obo_file = 'neurolex.obo'

# a simple function to pull out the cell names
# SYNONOYM_NAME is the bound `search` of a regex capturing the text between the
# first pair of double quotes; get_cell_names applies it to each raw synonym value.
# NOTE(review): the identifier is misspelled ('SYNONOYM'); kept as-is because
# get_cell_names references it by this name.
SYNONOYM_NAME = re.compile(r'"(.*?)"').search
def get_cell_names(obo_onto_file):
    """Collect the name and synonyms of every identified Term stanza.

    Args:
        obo_onto_file: OBO file passed straight to ``oboparser.parse``.

    Returns:
        dict mapping each term id to a ``(name, synonyms)`` tuple, where
        ``synonyms`` is the list of quoted labels pulled out of the stanza's
        ``synonym`` entries (in stanza order).
    """
    names_by_id = {}
    for term in oboparser.parse(obo_onto_file):
        # only Term stanzas are of interest
        if term['@type'] != "Term":
            continue
        term_id = term["id"]
        # stanzas without an id cannot be keyed, so they are dropped
        if term_id == '':
            continue
        label = term["name"]
        quoted_labels = [SYNONOYM_NAME(raw).group(1) for raw in term["synonym"]]
        names_by_id[term_id] = (label, quoted_labels)
    return names_by_id

# Parse both ontologies, report their sizes and sanity-check one known HBP entry.
hbp_cell_names  = get_cell_names(hbp_obo_file)
nlex_cell_names = get_cell_names(nlex_obo_file)
# Single-argument print(...) emits the same text under Python 2 and Python 3
# (the bare print statement is a syntax error on Python 3).
print('hbp_cell_names %d' % len(hbp_cell_names))
print('nlex_cell_names %d' % len(nlex_cell_names))
# Each value is a (name, synonyms) tuple: [0] is the name, [1] the synonym list.
assert hbp_cell_names['HBP_CELL:0000016'][0] == 'neuron expressing Parvalbumin'
assert hbp_cell_names['HBP_CELL:0000016'][1][1] == 'Pvalb+ cell'

hbp_cell_names 133
nlex_cell_names 750


In [4]:
# PREPROCESS NEUROLEX and HBP
def preprocess(cell_names, prefix=''):
    """Annotate every cell name and synonym with the neuroner pipeline.

    Args:
        cell_names: dict of id -> (name, synonyms), as built by get_cell_names.
        prefix: text prepended to each label before it is annotated.

    Returns:
        dict of id -> list of cleaned-up annotation results; element 0 belongs
        to the name, the remaining elements to the synonyms in order.
    """
    def _annotate(label):
        # annotate the prefixed label and clean up the raw annotations
        return _cleanup(neuroner.annotate(prefix + label).annotations)

    processed = {}
    for cell_id, (label, aliases) in cell_names.items():
        print('name::', label, "id:", cell_id, "prefix:", prefix)
        annotated = [_annotate(label)]  # name
        for alias in aliases:
            print('         syn::', alias)
            annotated.append(_annotate(alias))  # synonyms
        processed[cell_id] = annotated
    return processed

# HBP labels get the text 'neocortex ' prepended before annotation;
# NeuroLex labels are annotated unchanged (preprocess defaults to an empty prefix).
hbp_cell_names_processed  = preprocess(hbp_cell_names, prefix='neocortex ')
nlex_cell_names_processed = preprocess(nlex_cell_names)
print('done :-)')

('name::', 'Axonal Cell', 'id:', 'HBP_CELL:0000065', 'prefix:', 'neocortex ')
('name::', 'Nest Basket Cell', 'id:', 'HBP_CELL:0000064', 'prefix:', 'neocortex ')
('         syn::', 'arcade cell')
('         syn::', 'willow cell')
('         syn::', 'NBC')
('name::', 'Large Axon Cell', 'id:', 'HBP_CELL:0000067', 'prefix:', 'neocortex ')
('         syn::', 'LAC')
('name::', 'Descending Axon Cell', 'id:', 'HBP_CELL:0000066', 'prefix:', 'neocortex ')
('         syn::', 'DAC')
('name::', 'Cajal-Retzius cell like cell', 'id:', 'HBP_CELL:0000061', 'prefix:', 'neocortex ')
('         syn::', 'CrC')
('name::', 'Chandelier Cell', 'id:', 'HBP_CELL:0000060', 'prefix:', 'neocortex ')
('         syn::', 'Axo-axonic cell')
('         syn::', 'ChC')
('name::', 'Large Basket Cell', 'id:', 'HBP_CELL:0000063', 'prefix:', 'neocortex ')
('         syn::', 'LBC')
('name::', 'Small Basket Cell', 'id:', 'HBP_CELL:0000062', 'prefix:', 'neocortex ')
('         syn::', 'SBC')
('name::', 'Layer V Neurogliaform Cel

In [5]:
import pickle
# Persist the annotated names so they can be reloaded later.
# `with` guarantees each file is flushed and closed once its dump finishes;
# passing a bare open(...) to pickle.dump leaves the handle dangling.
with open('hbp_cell_names_processed.pckl', 'wb') as hbp_pickle_file:
    pickle.dump(hbp_cell_names_processed, hbp_pickle_file)
with open('nlex_cell_names_processed.pckl', 'wb') as nlex_pickle_file:
    pickle.dump(nlex_cell_names_processed, nlex_pickle_file)

In [6]:
# Raw (name, synonyms) entry and its annotated variants for one HBP cell.
# Single-argument print(...) prints the same repr on Python 2 and Python 3.
print(hbp_cell_names['HBP_CELL:0000064'])
print(hbp_cell_names_processed['HBP_CELL:0000064'])

('Nest Basket Cell', ['arcade cell', 'willow cell', 'NBC'])
[[u'ABA_REGION:315', u'HBP_MORPHOLOGY:0000018', u'HBP_MORPHOLOGY:0000019'], [u'ABA_REGION:315', 'Missing:arcade'], [u'ABA_REGION:315', 'Missing:willow'], [u'ABA_REGION:315', u'NCBI_GENE:54403']]


In [17]:
# Raw (name, synonyms) entry and its annotated variants for one NeuroLex cell.
# Single-argument print(...) prints the same repr on Python 2 and Python 3.
print(nlex_cell_names['nifext_56'])
print(nlex_cell_names_processed['nifext_56'])

('Neocortex basket cell', ['cortical basket cell', 'basket cell', 'cortical basket neuron', 'Neocortical basket cell', 'parvalbumin interneuron'])
[[u'ABA_REGION:315', u'HBP_MORPHOLOGY:0000019'], [u'ABA_REGION:688', u'HBP_MORPHOLOGY:0000019'], [u'HBP_MORPHOLOGY:0000019'], [u'ABA_REGION:688', u'HBP_MORPHOLOGY:0000019'], [u'ABA_REGION:315', u'HBP_MORPHOLOGY:0000019'], [u'NCBI_GENE:19293']]


In [18]:
# Spot check: compare the name annotations (variant 0) of HBP 'Nest Basket Cell'
# and NeuroLex 'Neocortex basket cell'.
similarity2(hbp_cell_names_processed['HBP_CELL:0000064'][0], nlex_cell_names_processed['nifext_56'][0])

(0.8333333333333334,
 [[],
  (['ABA_REGION:315'], 'exact same brain region'),
  [],
  [],
  ([u'HBP_MORPHOLOGY:0000019'], 'shares morphology'),
  [],
  [],
  []])

In [20]:
#%debug
# Score the name annotation of every HBP cell against every non-empty NeuroLex
# variant. Each stored hit is a tuple: (nlex_id, sim_score, explanations...).
hbp_hits = {}
for hbp_id, hbp_cell_variants in hbp_cell_names_processed.items():
    hbp_variant = hbp_cell_variants[0]  # TODO: only the 1st variant (the name) is compared
    if len(hbp_variant) == 0:
        # nothing to compare for this cell: report it and move on
        print('NO VARIANT FOR ', hbp_id, hbp_cell_names[hbp_id])
        continue

    hits = []
    for nlex_id, nlex_cell_variants in nlex_cell_names_processed.items():
        for variant in nlex_cell_variants:
            if len(variant) > 0:
                sim = similarity2(hbp_variant, variant, use_inter_similarity=True)
                # keep only candidates with a strictly positive score
                if sim[0] > 0:
                    hits.append((nlex_id,) + sim)
    hbp_hits[hbp_id] = hits
print('hits', len(hbp_hits))

('hits', 133)


In [21]:
# All hits recorded for HBP_CELL:0000064 ('Nest Basket Cell'), in insertion order.
hbp_hits['HBP_CELL:0000064']

[('nlx_143940',
  0.3673469387755102,
  [[],
   (['ABA_REGION:315'], 'exact same brain region'),
   [],
   [],
   [],
   [],
   [],
   []]),
 ('nifext_134',
  0.5,
  [[],
   (['ABA_REGION:315'], 'exact same brain region'),
   [],
   [],
   [],
   [],
   [],
   []]),
 ('nlx_156753',
  0.3103448275862069,
  [[],
   (['ABA_REGION:315'], 'exact same brain region'),
   [],
   [],
   [],
   [],
   [],
   []]),
 ('nlx_156753',
  0.6,
  [[],
   (['ABA_REGION:315'], 'exact same brain region'),
   [],
   [],
   [],
   [],
   [],
   []]),
 ('nlx_156750',
  0.3103448275862069,
  [[],
   (['ABA_REGION:315'], 'exact same brain region'),
   [],
   [],
   [],
   [],
   [],
   []]),
 ('nlx_156750',
  0.6,
  [[],
   (['ABA_REGION:315'], 'exact same brain region'),
   [],
   [],
   [],
   [],
   [],
   []]),
 ('nlx_156754',
  0.3103448275862069,
  [[],
   (['ABA_REGION:315'], 'exact same brain region'),
   [],
   [],
   [],
   [],
   [],
   []]),
 ('nlx_156754',
  0.6,
  [[],
   (['ABA_REGION:315'], 'exa

In [60]:
#%debug

import collections
try:  # Python 3.3+: the ABCs live in collections.abc (the old alias was removed in 3.10)
    from collections.abc import Iterable as _Iterable
except ImportError:  # Python 2
    from collections import Iterable as _Iterable

try:
    _ATOMIC_TEXT = (basestring,)  # Python 2: covers both str and unicode
except NameError:
    _ATOMIC_TEXT = (str, bytes)   # Python 3: basestring no longer exists


def flatten(l):
    """Lazily yield the leaves of an arbitrarily nested iterable, depth first.

    Strings are treated as leaves and are never split into characters; every
    other iterable element is descended into recursively.

    Args:
        l: an iterable whose elements may themselves be iterables.

    Yields:
        Each non-iterable element or string, in left-to-right order.
    """
    for el in l:
        if isinstance(el, _Iterable) and not isinstance(el, _ATOMIC_TEXT):
            # explicit loop instead of `yield from` keeps Python 2 compatibility
            for sub in flatten(el):
                yield sub
        else:
            yield el

def get_key(item):
    """Sort key for a hit tuple: its second field (the similarity score)."""
    score = item[1]
    return score

# Export, for every HBP cell type, its best-scoring NeuroLex candidates as TSV.
with open('hbp-neurolex.tsv', 'w') as outf:
    # Every data row carries six tab-separated fields, so the header names six
    # columns; the trailing explanation column used to be unlabeled, which
    # misaligned header and rows for any TSV reader.
    outf.write('BBP label\tBBP rdf\tNeurolex name\t%\tNeurolex ID\tExplanation\n')
    for hbp_id, hits in hbp_hits.items():
        if len(hits) > 0:
            # best score first
            hits_sorted = sorted(hits, key=get_key, reverse=True)

            already_printed = []
            # NOTE: the top-5 cut is taken before de-duplication and filtering,
            # so a cell can end up with fewer than five rows.
            for nlex_id, score, explain in hits_sorted[:5]:

                explain_str = ' '.join(flatten(explain))

                # one row per NeuroLex id, and only if more than Neocortex matched
                if nlex_id not in already_printed and explain_str != 'ABA_REGION:315 exact same brain region':
                    already_printed.append(nlex_id)
                    outf.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(hbp_cell_names[hbp_id][0], hbp_id, nlex_cell_names[nlex_id][0], round(score, 2), nlex_id, explain_str))


In [45]:
# Annotate one bare label (no 'neocortex ' prefix) to inspect its cleaned-up annotations.
cell_= 'Nest Basket Cell' #'nest basket cell'
_cleanup(neuroner.annotate(cell_).annotations)


[u'HBP_MORPHOLOGY:0000018', u'HBP_MORPHOLOGY:0000019']

In [47]:
# Five highest-scoring hits for HBP_CELL:0000061 ('Cajal-Retzius cell like cell').
sorted(hbp_hits['HBP_CELL:0000061'], key=get_key, reverse=True)[:5]

[('nlx_cell_20081206',
  1.0,
  [[],
   (['ABA_REGION:315'], 'exact same brain region'),
   [],
   [],
   ([u'HBP_MORPHOLOGY:0000008'], 'shares morphology'),
   [],
   [],
   []]),
 ('nlx_156753',
  0.75,
  [[],
   (['ABA_REGION:315'], 'exact same brain region'),
   [],
   [],
   [],
   [],
   [],
   []]),
 ('nlx_156750',
  0.75,
  [[],
   (['ABA_REGION:315'], 'exact same brain region'),
   [],
   [],
   [],
   [],
   [],
   []]),
 ('nlx_156754',
  0.75,
  [[],
   (['ABA_REGION:315'], 'exact same brain region'),
   [],
   [],
   [],
   [],
   [],
   []]),
 ('nlx_157255',
  0.75,
  [[],
   (['ABA_REGION:315'], 'exact same brain region'),
   [],
   [],
   [],
   [],
   [],
   []])]