In [46]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
from similarity import similarity
from similarity_intra import load_ontologies

# Computes the intra and inter semantic similarity between two neurons
# in: n1@str, n2@str: the two neurons to measure similarity
# out: (score:float, [(matching_properties, explanation@str)])
#similarity('PV Layer 1 cortex acetylcholine neuron', 'Layer 1a striatum gaba fast-spiking neuron')
similarity('Periventricular zone neuron', 'Hypothalamic medial zone neuron')

(0.75, [[], (['ABA_REGION:1097'], 'sibling regions'), [], [], [], []])

In [8]:
import pandas as pd


# download neuroelectro summary spreadsheet from URL and import into pandas data frame
neuroelectro_dataset_url = "http://dev.neuroelectro.org/static/src/article_ephys_metadata_curated.csv"
table = pd.read_csv(neuroelectro_dataset_url, sep='\t', index_col = 0, header=0)

In [30]:
from sherlok import Sherlok
s = Sherlok('neuroner')

from similarity import _cleanup, _normalize

neuron_name = 'Neocortex corticostriatal NPY somatostatin-expressing with depolarizing fast spiking AHP Layer 5a pyramidal cell'
r = s.annotate(neuron_name)
#annot_list = format_annots(neuron_name, r.annotations)
al = _cleanup(r.annotations, neuron_name)
print _normalize(al, shorten = True)

#print annot_list
print al

Isocortex CStr Npy Sst with depolarizing F S AHP L5a Pyr
[u'ABA_REGION:315', u'UNKN_REGION:7322', u'NCBI_GENE:109648', u'NCBI_GENE:20604', 'Missing:with depolarizing', u'HBP_EPHYS:0000080', u'HBP_EPHYS_TRIGGER:0000003', 'Missing:AHP', u'HBP_LAYER:0000050', u'HBP_MORPHOLOGY:0000001']


In [31]:
big_onto = load_ontologies()


In [32]:
name_spaces = [('NCBI_GENE', 'gene'), ('HBP_LAYER', 'layer'), ('HBP_MORPHOLOGY', 'shape'), ('REGION', 'region'),
              ('HBP_EPHYS', 'ephys'), ]
proj_name = 'projection'

proj_list = ['CStr', 'CSp', 'CThal', 'CCo']

def annots_to_dict(annot_list):
    temp_dict = {}
    for annotation in annot_list:
        for s in name_spaces:
            if s[0] in annotation:
                term = _normalize([annotation], shorten = True)
                if term in proj_list:
                    temp_dict[proj_name] = term
                    continue
                if s[1] not in temp_dict:
                    temp_dict[s[1]] = term
                else:
                    temp_dict[s[1]] = '%s %s' %  (temp_dict[s[1]],  term)
    return temp_dict

In [33]:
# use sherlok to annotate each curated neuron mention in neuroelectro
annot_vec = []
dict_list = []
for index, row in table.iterrows():
    neuron_name = row['NeuronLongName']
    
    # if no neuron long name, use neurolex neuron name (should always be available)
    if pd.isnull(neuron_name):
        neuron_name = row['NeuronName']
    # annotate using sherlok
    r = s.annotate(neuron_name)
    cleaned_annotations = _cleanup(r.annotations, neuron_name)
    short_annots = _normalize(cleaned_annotations, shorten = True)
    temp_dict = annots_to_dict(cleaned_annotations)
    temp_dict['short_name'] = short_annots
    temp_dict['annotations'] = cleaned_annotations
    temp_dict['Index'] = index
    dict_list.append(temp_dict)

#     f = format_annots(neuron_name, r.annotations)
#     annot_vec.append((neuron_name, f))

In [9]:
dict_list[4]

{'Index': 949, 'region': 'HIP', 'short_name': 'glu HIP autaptic'}

In [34]:
col_names = [n[1] for n in name_spaces]
col_names.append('projection')

df = pd.DataFrame.from_records(dict_list, index = 'Index')
df

table[0:4]

Unnamed: 0_level_0,Title,Pmid,PubYear,LastAuthor,ArticleID,TableID,NeuronName,NeuronLongName,BrainRegion,MetadataCurated,...,spike rise time,access resistance,spike decay time,fast AHP duration,slow AHP duration,cell diameter,medium AHP amplitude,cell surface area,ADP duration,medium AHP duration
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1010,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,,,,,,,,,,
1011,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,,,,,,,,,,
1012,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,,,,,,,,,,
1018,Rac1 and rac3 GTPases control synergistically ...,23258346,2014,de Curtis I,88539,36200,Hippocampus CA3 pyramidal cell,,Hippocampal formation,True,...,,,,,,,,,,


In [65]:
table = table.join(df)

In [66]:
table.to_csv('df.csv', sep='\t', encoding='utf-8')

In [56]:
table

Unnamed: 0_level_0,Title,Pmid,PubYear,LastAuthor,ArticleID,TableID,NeuronName,NeuronLongName,BrainRegion,MetadataCurated,...,fast AHP duration,cell diameter,slow AHP duration,medium AHP amplitude,ADP duration,cell surface area,medium AHP duration,region,shape,short_name
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
954,Loss of mTOR repressors Tsc1 or Pten has diver...,24574959,2014,Swann JW,85608,27436,Other,Glutamatergic hippocampal autaptic neuron,,True,...,,,,,,,,HIP,,glu HIP autaptic
955,Loss of mTOR repressors Tsc1 or Pten has diver...,24574959,2014,Swann JW,85608,27436,Other,Glutamatergic hippocampal autaptic neuron,,True,...,,,,,,,,HIP,,glu HIP autaptic
451,Greater excitability and firing irregularity o...,24614745,2014,Urban NN,85554,,Olfactory bulb (main) mitral cell,,Olfactory areas,False,...,,,,,,,,MOB,mitral,MOB (main) mitral
964,Ex vivo dissection of optogenetically activate...,24634648,2014,Ehrlich I,85710,27636,Amygdala basolateral nucleus pyramidal neuron,Basolateral amygdala principal neurons with in...,Cortical subplate,True,...,,,,,,,,basolateral sAMY ventral CA,,basolateral sAMY principal with input from ven...
965,Ex vivo dissection of optogenetically activate...,24634648,2014,Ehrlich I,85710,27636,Amygdala basolateral nucleus pyramidal neuron,Basolateral amygdala principal neuron with inp...,Cortical subplate,True,...,,,,,,,,basolateral sAMY medial prefrontal cortex CTX,,basolateral sAMY principal with input from med...
971,Ex vivo dissection of optogenetically activate...,24634648,2014,Ehrlich I,85710,27639,Neocortex interneuron deep,ventral hippocampus stimulated interneurons in...,,True,...,,,,,,,,ventral CA basolateral sAMY,,ventral CA stimulated stimulated in basolatera...
972,Ex vivo dissection of optogenetically activate...,24634648,2014,Ehrlich I,85710,27639,Neocortex interneuron deep,medial prefrontal cortex stimulated interneuro...,,True,...,,,,,,,,,,
1002,Ex vivo dissection of optogenetically activate...,24634648,2014,Ehrlich I,85710,27639,Neocortex interneuron deep,ventral hippocampus stimulated fast-spiking in...,,True,...,,,,,,,,,,
1003,Ex vivo dissection of optogenetically activate...,24634648,2014,Ehrlich I,85710,27639,Neocortex interneuron deep,medial prefrontal cortex stimulated fast-spiki...,,True,...,,,,,,,,,,
1004,Ex vivo dissection of optogenetically activate...,24634648,2014,Ehrlich I,85710,27639,Neocortex interneuron deep,medial prefrontal cortex stimulated non-fast s...,,True,...,,,,,,,,,,


In [58]:
# load in csv of Design.csv dataset

gene_exp_path = '/Users/shreejoy/brainCellTypeSpecificGenes/data/Design.tsv'
gene_exp_table = pd.read_csv(gene_exp_path, sep='\t', index_col = 0, header=0)


In [59]:
gene_exp_table.head()

Unnamed: 0_level_0,Description,Region,Anatomical.Region,neuroNER Precise Name,neuroNER Inferred Properties,GabaDeep,PyramidalDeep,CellType,forContanim,Normalize2.0,...,Age,AgeExact,Method,RNA.isolation.method,RNA.amplification.and.labeling.method,Platform,Reference,PMID,SharePermit,Notes
Cell.Type..Samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"H_lva_cortex:v2_GSM337817,v2_GSM337818,v2_GSM337819",Pyramidal Neurons,Neocortex,Layer 5A Neocortex,Neocortex Etv1-expressing cells,Neocortex layer 5a Etv1-expressing slender-tuf...,,,,Pyramidal,T,...,46-80,Adult,TRAP,RNeasy Micro Kit,T7 IVT,GPL1261,"Doyle et al., 2008",19013282,T,Astrocyte contamination
"H_lvb_cortex:v2_GSM337821,v2_GSM337822,v2_GSM337823",Pyramidal Neurons,Neocortex,Layer 5B Neocortex,Neocortex Glt25d2-expressing cells,Neocortex layer 5b Glt25d2-expressing thick-tu...,,,,Pyramidal,T,...,46-80,Adult,TRAP,RNeasy Micro Kit,T7 IVT,GPL1261,"Doyle et al., 2008",19013282,T,Astrocyte contamination
"H_l6_cortex:v2_GSM337837,v2_GSM337838,v2_GSM337839",Pyramidal Neurons,Neocortex,Layer 6 Neocortex,Neocortex Ntsr1-expressing cells,Neocortex layer 6 Ntsr1-expressing pyramidal c...,,,,Pyramidal,T,...,46-80,Adult,TRAP,RNeasy Micro Kit,T7 IVT,GPL1261,"Doyle et al., 2008",19013282,T,Astrocyte contamination
"H_cck+_cortex:v2_GSM337769,v2_GSM337770,v2_GSM337771",Mixed Neurons,Neocortex,Neocortex,Neocortex Cck-expressing cell,Neocortex Cck-expressing cell,,,,,F,...,46-80,Adult,TRAP,RNeasy Micro Kit,T7 IVT,GPL1261,"Doyle et al., 2008",19013282,T,
"H_mn_brainstem:v2_GSM337773,v2_GSM337774,v2_GSM337775","Motor Neurons, Midbrain Cholinergic Neurons",Brainstem,Brainstem,Brainstem Chat-expressing cells,Brainstem Chat-expressing cholinergic motor ne...,MotorCholin,MotorCholin,MotorCholin,MotorCholin,T,...,46-80,Adult,TRAP,RNeasy Micro Kit,T7 IVT,GPL1261,"Doyle et al., 2008",19013282,T,


In [60]:

# use sherlok to annotate each curated neuron mention in neuroelectro
#annot_vec = []
gene_dict_list = []
for index, row in gene_exp_table.iterrows():
    neuron_name = row['neuroNER Inferred Properties']
    
    # if no neuron long name, just move on
    if pd.isnull(neuron_name):
        continue
        # neuron_name = row['NeuronName']
    # annotate using sherlok
    r = s.annotate(neuron_name)
    cleaned_annotations = _cleanup(r.annotations, neuron_name)
    short_annots = _normalize(cleaned_annotations, shorten = True)
    temp_dict = annots_to_dict(cleaned_annotations)
    temp_dict['short_name'] = short_annots
    temp_dict['annotations'] = cleaned_annotations
    temp_dict['Index'] = index
    gene_dict_list.append(temp_dict)

#     f = format_annots(neuron_name, r.annotations)
#     annot_vec.append((neuron_name, f))

In [61]:
df_gene = pd.DataFrame.from_records(gene_dict_list, index = 'Index')
df_gene

Unnamed: 0_level_0,annotations,ephys,gene,layer,projection,region,shape,short_name
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"H_lva_cortex:v2_GSM337817,v2_GSM337818,v2_GSM337819","[ABA_REGION:315, HBP_LAYER:0000050, NCBI_GENE:...",,Etv1,L5a,,Isocortex,ST Pyr,Isocortex L5a Etv1 ST Pyr
"H_lvb_cortex:v2_GSM337821,v2_GSM337822,v2_GSM337823","[ABA_REGION:315, HBP_LAYER:0000051, NCBI_GENE:...",,Colgalt2,L5b,,Isocortex,TT Pyr,Isocortex L5b Colgalt2 TT Pyr
"H_l6_cortex:v2_GSM337837,v2_GSM337838,v2_GSM337839","[ABA_REGION:315, HBP_LAYER:0000006, NCBI_GENE:...",,Ntsr1,L6,,Isocortex,Pyr,Isocortex L6 Ntsr1 Pyr
"H_cck+_cortex:v2_GSM337769,v2_GSM337770,v2_GSM337771","[ABA_REGION:315, NCBI_GENE:12424]",,Cck,,,Isocortex,,Isocortex Cck
"H_mn_brainstem:v2_GSM337773,v2_GSM337774,v2_GSM337775","[ABA_REGION:343, NCBI_GENE:12647, HBP_NEUROTRA...",,Chat,,,BS,,BS Chat ACh motor
"H_chol_basal_fb:v2_GSM337779,v2_GSM337780,v2_GSM337781","[UNKN_REGION:5, UNKN_REGION:4474, NCBI_GENE:12...",,Chat,,,basal forebrain,,basal forebrain Chat ACh
"H_chol_spinal:v2_GSM337785,v2_GSM337786,v2_GSM337787","[UNKN_REGION:4645, NCBI_GENE:12647, HBP_NEUROT...",,Chat,,,spinal cord,,spinal cord Chat ACh
"H_chol_corp_striatum:v2_GSM337791,v2_GSM337792,v2_GSM337793","[ABA_REGION:672, NCBI_GENE:12647, HBP_NEUROTRA...",,Chat,,,CP,,CP Chat ACh giant
"H_cort+_cortex:v2_GSM337804,v2_GSM337805,v2_GSM337806","[ABA_REGION:315, NCBI_GENE:12854]",,Cort,,,Isocortex,,Isocortex Cort
"H_drd1+_msn_striatum:v2_GSM337807,v2_GSM337808,v2_GSM337809","[ABA_REGION:672, NCBI_GENE:13488, Size:medium,...",,Drd1,,,CP,spiny,CP Drd1 medium spiny


In [29]:
gene_exp_table = gene_exp_table.join(df_gene)
gene_exp_table.to_csv('df_gene.csv', sep='\t', encoding='utf-8')

In [62]:
from similarity import similarity, similarity2
from similarity_intra import load_ontologies

# Computes the intra and inter semantic similarity between two neurons
# in: n1@str, n2@str: the two neurons to measure similarity
# out: (score:float, [(matching_properties, explanation@str)])
#similarity('PV Layer 1 cortex acetylcholine neuron', 'Layer 1a striatum gaba fast-spiking neuron')
similarity('Periventricular zone dopaminergic neuron', 'Hypothalamic medial zone dopaminergic neuron')

(0.75, [[], (['ABA_REGION:1097'], 'sibling regions'), [], [], [], []])

In [55]:
similarity2(gene_dict_list[25]['annotations'], dict_list[25]['annotations'])

(4.75,
 [(['HBP_LAYER:0000005'], 'located on same layer'),
  (['ABA_REGION:315'], 'sibling regions'),
  ([u'UNKN_REGION:22'], 'shares general regions'),
  ([u'HBP_MORPHOLOGY:0000001'], 'shares morphology'),
  [],
  []])

In [39]:
print dict_list[25]['short_name']
print gene_dict_list[25]['short_name']

SSs L5b CSp TT large Pyr
MO L5 CSp Pyr


In [63]:
for g in gene_dict_list:
    curr_list = g
    sim_vec = []
    sim_val_vec = []
    for d in dict_list:
        sim = similarity2(d['annotations'], curr_list['annotations'])
        sim_vec.append(sim)
        sim_val_vec.append(sim[0])
    sim_val_vec
    inds = sorted(range(len(sim_val_vec)),key=lambda x:sim_val_vec[x], reverse = True)
    print 'target name -> %s' % curr_list['short_name']
    for i in range(0,10):
        print 'match value: %s, best match: %s' % (sim_val_vec[inds[i]], dict_list[inds[i]]['short_name'])
    print '\n'

target name -> Isocortex L5a Etv1 ST Pyr
match value: 3.5, best match: Isocortex L5 Pyr
match value: 3.5, best match: Isocortex L5 large Pyr
match value: 3.5, best match: Isocortex Pyr L5-6
match value: 3.5, best match: L5 Isocortex Pyr
match value: 3.5, best match: Isocortex L5 Pyr
match value: 3.5, best match: Isocortex Pyr L5-6
match value: 3.5, best match: Isocortex Pyr L5-6
match value: 3.125, best match: ORB L5 large Pyr
match value: 3.125, best match: VIS L5 R S tall Pyr
match value: 3.125, best match: VIS L5 B tall Pyr


target name -> Isocortex L5b Colgalt2 TT Pyr
match value: 4.125, best match: SSs L5b CSp TT large Pyr
match value: 4.125, best match: MOp L5b CSp TT large Pyr
match value: 4.125, best match: MOp L5b CSp TT large Pyr
match value: 4.125, best match: MOp L5b CSp TT large Pyr
match value: 4.125, best match: L5 VIS large TT Pyr
match value: 4.125, best match: SS L5 TT Pyr
match value: 3.5, best match: Isocortex L5 Pyr
match value: 3.5, best match: Isocortex L5 large

In [135]:
reload(similarity)

TypeError: reload() argument must be module

In [None]:
print 'target name -> %s' % curr_list['short_name']
for i in range(1,50):
    print dict_list[inds[i]]['short_name']