In [46]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
from similarity import similarity
from similarity_intra import load_ontologies

# Computes the intra and inter semantic similarity between two neurons
# in: n1@str, n2@str: the two neurons to measure similarity
# out: (score:float, [(matching_properties, explanation@str)])
#similarity('PV Layer 1 cortex acetylcholine neuron', 'Layer 1a striatum gaba fast-spiking neuron')
similarity('Periventricular zone layer 5a neuron', 'Hypothalamic medial zone layer 5-6 neuron')

(1.25,
 [(['HBP_LAYER:0000005'], 'shares layers'),
  (['ABA_REGION:1097'], 'sibling regions'),
  [],
  [],
  [],
  [],
  [],
  []])

In [3]:
import pandas as pd


# download neuroelectro summary spreadsheet from URL and import into pandas data frame
neuroelectro_dataset_url = "http://dev.neuroelectro.org/static/src/article_ephys_metadata_curated.csv"
table = pd.read_csv(neuroelectro_dataset_url, sep='\t', index_col = 0, header=0)

In [23]:
big_onto = load_ontologies()

In [24]:
name_spaces = [('NCBI_GENE', 'gene'), ('HBP_LAYER', 'layer'), ('HBP_MORPHOLOGY', 'shape'), ('REGION', 'region'),
              ('HBP_EPHYS', 'ephys'), ('MOUSE_LINE', 'line'),]
proj_name = 'projection'

proj_list = ['CStr', 'CSp', 'CThal', 'CCo', 'CTrig', 'CTect']

def annots_to_dict(annot_list):
    temp_dict = {}
    for annotation in annot_list:
        for s in name_spaces:
            if s[0] in annotation:
                term = _normalize([annotation], shorten = True)
                if term in proj_list:
                    temp_dict[proj_name] = term
                    continue
                if s[1] not in temp_dict:
                    temp_dict[s[1]] = term
                else:
                    temp_dict[s[1]] = '%s %s' %  (temp_dict[s[1]],  term)
    return temp_dict

In [25]:
def add_animal_strain_to_neuron(input_neuron_name, strain):
    strain_list = ['GIN', 'G42', 'GAD65-GFP', 'Thy1-GFP']
    output_neuron_name = input_neuron_name

    if strain in strain_list:
        output_neuron_name += ' from %s mouse line' % strain
    return output_neuron_name
def check_strain(neuron_name):
    strain_list = ['GIN', 'G42', 'GAD65-GFP', 'Thy1-GFP']
    for s in strain_list:
        if s in neuron_name:
            return 'MOUSE_LINE:%s' % s
    return None
check_strain('Hypothalamic medial zone dopaminergic neuron from GIN mouse line')

'MOUSE_LINE:GIN'

In [48]:
from sherlok import Sherlok
s = Sherlok('neuroner')

from similarity import _cleanup, _normalize

neuron_name = 'Neocortex corticostriatal NPY somatostatin-expressing with depolarizing fast spiking AHP Layer 5a pyramidal cell from GIN mouse line'
r = s.annotate(neuron_name)
#annot_list = format_annots(neuron_name, r.annotations)
al = _cleanup(r.annotations, neuron_name)
if check_strain(neuron_name):
    al.append(check_strain(neuron_name))
print _normalize(al, shorten = True)

#print annot_list
print al

Isocortex CStr Npy Sst with depolarizing F S AHP L5a Pyr from GIN mouse line GIN
[u'ABA_REGION:315', 'HBP_PROJECTION:7322', u'NCBI_GENE:109648', u'NCBI_GENE:20604', 'Missing:with depolarizing', u'HBP_EPHYS:0000080', u'HBP_EPHYS_TRIGGER:0000003', 'Missing:AHP', u'HBP_LAYER:0000050', u'HBP_MORPHOLOGY:0000001', 'Missing:from GIN', 'Species:mouse', 'Missing:line', 'MOUSE_LINE:GIN']


In [49]:
# use sherlok to annotate each curated neuron mention in neuroelectro
annot_vec = []
dict_list = []
for index, row in table.iterrows():
    neuron_name = row['NeuronLongName']
    strain = unicode(row['Strain'])
    # if no neuron long name, use neurolex neuron name (should always be available)
    if pd.isnull(neuron_name):
        neuron_name = row['NeuronName']
    neuron_name = add_animal_strain_to_neuron(neuron_name, strain)
    # annotate using sherlok
    r = s.annotate(neuron_name)
    cleaned_annotations = _cleanup(r.annotations, neuron_name)
    
    # hack for adding strain annotations, since not in neuroNER
    if check_strain(neuron_name):
        cleaned_annotations.append(check_strain(neuron_name))
    short_annots = _normalize(cleaned_annotations, shorten = True)
    temp_dict = annots_to_dict(cleaned_annotations)
    temp_dict['neuron_name'] = neuron_name
    temp_dict['short_name'] = short_annots
    temp_dict['annotations'] = cleaned_annotations
    temp_dict['Index'] = index
    dict_list.append(temp_dict)

#     f = format_annots(neuron_name, r.annotations)
#     annot_vec.append((neuron_name, f))

In [9]:
dict_list[4]

{'Index': 949, 'region': 'HIP', 'short_name': 'glu HIP autaptic'}

In [50]:
col_names = [n[1] for n in name_spaces]
col_names.append('projection')

df = pd.DataFrame.from_records(dict_list, index = 'Index')
df

table[0:4]

Unnamed: 0_level_0,Title,Pmid,PubYear,LastAuthor,ArticleID,TableID,NeuronName,NeuronLongName,BrainRegion,MetadataCurated,...,spike rise time,access resistance,spike decay time,fast AHP duration,cell diameter,slow AHP duration,medium AHP amplitude,cell surface area,ADP duration,medium AHP duration
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1010,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,,,,,,,,,,
1011,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,,,,,,,,,,
1012,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,,,,,,,,,,
1018,Rac1 and rac3 GTPases control synergistically ...,23258346,2014,de Curtis I,88539,36200,Hippocampus CA3 pyramidal cell,,Hippocampal formation,True,...,,,,,,,,,,


In [51]:
table = table.join(df)

In [66]:
table.to_csv('df.csv', sep='\t', encoding='utf-8')

In [52]:
table.head()

Unnamed: 0_level_0,Title,Pmid,PubYear,LastAuthor,ArticleID,TableID,NeuronName,NeuronLongName,BrainRegion,MetadataCurated,...,medium AHP duration,annotations,ephys,gene,layer,line,neuron_name,region,shape,short_name
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1010,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,,"[ABA_REGION:1080, HBP_NEUROTRANSMITTER:0000004...",,,,,Hippocampus GluN2D-EGFP-positive interneuron,HIP,,HIP glu N2D-EGFP- positive
1011,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,,"[ABA_REGION:1080, HBP_NEUROTRANSMITTER:0000004...",,,,,Hippocampus GluN2D-EGFP-positive interneuron,HIP,,HIP glu N2D-EGFP- positive
1012,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,,"[ABA_REGION:1080, HBP_NEUROTRANSMITTER:0000004...",,,,,Hippocampus GluN2D-EGFP-positive interneuron,HIP,,HIP glu N2D-EGFP- positive
1018,Rac1 and rac3 GTPases control synergistically ...,23258346,2014,de Curtis I,88539,36200,Hippocampus CA3 pyramidal cell,,Hippocampal formation,True,...,,"[ABA_REGION:463, HBP_MORPHOLOGY:0000001]",,,,,Hippocampus CA3 pyramidal cell,CA3,Pyr,CA3 Pyr
949,Loss of mTOR repressors Tsc1 or Pten has diver...,24574959,2014,Swann JW,85608,27436,Other,Glutamatergic hippocampal autaptic neuron,,True,...,,"[HBP_NEUROTRANSMITTER:0000004, ABA_REGION:1080...",,,,,Glutamatergic hippocampal autaptic neuron,HIP,,glu HIP autaptic


In [53]:
# load in csv of Design.csv dataset

gene_exp_path = '/Users/shreejoy/brainCellTypeSpecificGenes/data/Design.txt'
gene_exp_table = pd.read_csv(gene_exp_path, sep='\t', index_col = 0, header=0)


In [54]:
gene_exp_table.tail()

Unnamed: 0_level_0,Description,Region,Anatomical.Region,neuroNER Precise Name,neuroNER Inferred Properties,GabaDeep,PyramidalDeep,CellType,forContanim,Normalize2.0,...,Age,AgeExact,Method,RNA.isolation.method,RNA.amplification.and.labeling.method,Platform,Reference,PMID,SharePermit,Notes
Cell.Type..Samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"GSM215871,GSM215872,GSM215873",Tyrosine Hydroxylase Expressing,LocusCereuleus,LocusCereuleus,Locus Cereuleus Th-expressing cell,Locus Cereuleus Th-expressing noradrenergic cell,Th_positive_LC,Th_positive_LC,Th_positive_LC,Th_positive_LC,T,...,,39,Manual,,,GPL1261,Sugino et al. 2014,25232122.0,T,
"GSM444379,GSM444380,GSM444381",Purkinje Cells from G42,Cerebellum,Cerebellum,Cerebellum Purkinje cell,Cerebellum Purkinje cell from G42 mouse line,Purkinje,Purkinje,Purkinje,Purkinje,T,...,,38,Manual,,,GPL1261,Sugino et al. 2014,25232122.0,T,
"GSM1464137,GSM1464138,GSM1464139",Tyrosine Hydroxylase Expressing young,LocusCereuleus,LocusCereuleus,Locus Cereuleus Th-expressing cell,Locus Cereuleus Th-expressing noradrenergic cell,Th_positive_LC,Th_positive_LC,Th_positive_LC,Th_positive_LC,T,...,,23,Manual,,,GPL1261,Sugino et al. 2014,25232122.0,T,
"GSM437382,GSM437383,GSM437384",A9 Dopaminergic Neurons,Midbrain,Substantia Nigra,Substantia nigra pars compacta Th-expressing cell,Substantia nigra pars compacta Th-expressing d...,Dopaminergic,Dopaminergic,Dopaminergic,Dopaminergic,T,...,,Adult,LCM,,,GPL1261,Phani et al. 2015,,T,
"GSM437391,GSM437392,GSM437393",A10 Dopaminergic Neurons,Midbrain,ventral midbrain tagmentum,Ventral tegmental area Th-expressing cell,Ventral tegmental area Th-expressing dopaminer...,Dopaminergic,Dopaminergic,Dopaminergic,Dopaminergic,T,...,,Adult,LCM,,,GPL1261,Phani et al. 2015,,T,


In [55]:

# use sherlok to annotate each curated neuron mention in neuroelectro
#annot_vec = []
gene_dict_list = []
for index, row in gene_exp_table.iterrows():
    neuron_name = row['neuroNER Inferred Properties']
    
    # if no neuron long name, just move on
    if pd.isnull(neuron_name):
        continue
        # neuron_name = row['NeuronName']
    # annotate using sherlok
    r = s.annotate(neuron_name)
    cleaned_annotations = _cleanup(r.annotations, neuron_name)
    if check_strain(neuron_name):
        cleaned_annotations.append(check_strain(neuron_name))
    short_annots = _normalize(cleaned_annotations, shorten = True)
    temp_dict = annots_to_dict(cleaned_annotations)
    temp_dict['short_name'] = short_annots
    temp_dict['neuron_name'] = neuron_name
    temp_dict['annotations'] = cleaned_annotations
    temp_dict['Index'] = index
    gene_dict_list.append(temp_dict)

#     f = format_annots(neuron_name, r.annotations)
#     annot_vec.append((neuron_name, f))

In [None]:
strain_list = ['GIN', 'G42', 'GAD65-GFP', 'Thy1-GFP']
for index, row in table.iterrows():
    strain = unicode(row['Strain'])
    neuron_name = row['NeuronLongName']
    if pd.isnull(neuron_name):
        neuron_name = row['NeuronName']
    if strain in strain_list:
        print strain
        neuron_name += ' from %s mouse line' % strain
        print neuron_name

'MOUSE_LINE:GIN'

In [56]:
df_gene = pd.DataFrame.from_records(gene_dict_list, index = 'Index')
gene_exp_table = gene_exp_table.join(df_gene)

In [29]:
gene_exp_table.to_csv('df_gene.csv', sep='\t', encoding='utf-8')

In [58]:
from similarity import similarity, similarity2
from similarity_intra import load_ontologies

# Computes the intra and inter semantic similarity between two neurons
# in: n1@str, n2@str: the two neurons to measure similarity
# out: (score:float, [(matching_properties, explanation@str)])
#similarity('PV Layer 1 cortex acetylcholine neuron', 'Layer 1a striatum gaba fast-spiking neuron')
similarity('spinal cord hippocampus CA3 neuron', 'spinal cord hippocampus CA1 neuron')

(1.0,
 [[],
  (['ABA_REGION:1089'], 'sibling regions'),
  [],
  ([u'UNKN_REGION:4645'], 'shares general regions'),
  [],
  [],
  [],
  []])

In [59]:
similarity2(gene_dict_list[25]['annotations'], dict_list[25]['annotations'])

(4.75,
 [(['HBP_LAYER:0000005'], 'shares layers'),
  (['ABA_REGION:315'], 'sibling regions'),
  (['HBP_PROJECTION:22'], 'shares projection patterns'),
  [],
  ([u'HBP_MORPHOLOGY:0000001'], 'shares morphology'),
  [],
  [],
  []])

In [60]:
print dict_list[25]['short_name']
print gene_dict_list[25]['short_name']

SSs L5b CSp TT large Pyr
MO L5 CSp Pyr


In [70]:
for g in gene_dict_list:
    curr_list = g
    sim_vec = []
    sim_val_vec = []
    for d in dict_list:
        sim = similarity2(curr_list['annotations'], d['annotations'])
        sim_vec.append(sim)
        sim_val_vec.append(sim[0])
    sim_val_vec
    inds = sorted(range(len(sim_val_vec)),key=lambda x:sim_val_vec[x], reverse = True)
    print 'target name -> %s' % curr_list['neuron_name']
    for i in range(0,10):
        print 'match value: %.2f, best match: %s' % (sim_val_vec[inds[i]], dict_list[inds[i]]['neuron_name'])
    print '\n'

target name -> Neocortex layer 5a Etv1-expressing slender-tufted pyramidal cells
match value: 0.66, best match: barrel cortex pyramidal neurons layer 5a
match value: 0.66, best match: Layer 5a somatosensory cortex barrel column small tuft pyramidal neurons
match value: 0.66, best match: Layer 5a somatosensory barrel cortex pyramidal neurons
match value: 0.66, best match: Layer 5a somatosensory barrel cortex pyramidal neurons
match value: 0.66, best match: Layer 5a somatosensory barrel cortex pyramidal neurons
match value: 0.66, best match: somatosensory barrel cortex layer 5a pyramidal cell
match value: 0.65, best match: Neocortex Layer V Pyramidal Cell
match value: 0.65, best match: neocortex layer 5 large pyramidal cell
match value: 0.65, best match: Layer 5 neocortex pyramidal neuron
match value: 0.65, best match: Neocortex layer 5 pyramidal neuron


target name -> Neocortex layer 5b Glt25d2-expressing thick-tufted pyramidal cells
match value: 0.80, best match: Secondary somatosenso

In [135]:
reload(similarity)

TypeError: reload() argument must be module

In [None]:
print 'target name -> %s' % curr_list['short_name']
for i in range(1,50):
    print dict_list[inds[i]]['short_name']