In [507]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [508]:
from similarity import similarity
from similarity_intra import load_ontologies

# Computes the intra and inter semantic similarity between two neurons
# in: n1@str, n2@str: the two neurons to measure similarity
# out: (score:float, [(matching_properties, explanation@str)])
#similarity('PV Layer 1 cortex acetylcholine neuron', 'Layer 1a striatum gaba fast-spiking neuron')
similarity('Periventricular zone layer 5a neuron', 'Hypothalamic medial zone layer 5-6 neuron')

(0.6987179487179488,
 [(['HBP_LAYER:0000005'], 'shares layers'),
  (['ABA_REGION:1097'], 'sibling regions'),
  [],
  [],
  [],
  [],
  [],
  []])

In [509]:
import pandas as pd


# download neuroelectro summary spreadsheet from URL and import into pandas data frame
neuroelectro_dataset_url = "http://dev.neuroelectro.org/static/src/article_ephys_metadata_curated.csv"
table = pd.read_csv(neuroelectro_dataset_url, sep='\t', index_col = 0, header=0)

In [510]:
big_onto = load_ontologies()

In [511]:
name_spaces = [('NCBI_GENE', 'gene'), ('HBP_LAYER', 'layer'), ('HBP_MORPHOLOGY', 'shape'), ('REGION', 'region'),
              ('HBP_EPHYS', 'ephys'), ('MOUSE_LINE', 'line'),]
proj_name = 'projection'

proj_list = ['CStr', 'CSp', 'CThal', 'CCo', 'CTrig', 'CTect']

def annots_to_dict(annot_list):
    temp_dict = {}
    for annotation in annot_list:
        for s in name_spaces:
            if s[0] in annotation:
                term = _normalize([annotation], shorten = True)
                if term in proj_list:
                    temp_dict[proj_name] = term
                    continue
                if s[1] not in temp_dict:
                    temp_dict[s[1]] = term
                else:
                    temp_dict[s[1]] = '%s %s' %  (temp_dict[s[1]],  term)
    return temp_dict

In [512]:
strain_list = ['GIN', 'G42', 'GAD65-GFP', 'Thy1-YFP']
def add_animal_strain_to_neuron(input_neuron_name, strain):
    output_neuron_name = input_neuron_name

    if strain in strain_list:
        #output_neuron_name += ' from %s mouse line' % strain
        output_neuron_name += ' %s' % strain
    return output_neuron_name
def check_strain(neuron_name):
    for s in strain_list:
        if s in neuron_name:
            return 'MOUSE_LINE:%s' % s
    return None
check_strain('Hypothalamic medial zone dopaminergic neuron from Thy1-YFP mouse line')

'MOUSE_LINE:Thy1-YFP'

In [513]:
from sherlok import Sherlok
s = Sherlok('neuroner')

from similarity import _cleanup, _normalize

neuron_name = 'Neocortex corticostriatal NPY somatostatin-expressing with depolarizing fast spiking AHP Layer 5a pyramidal cell GIN'
r = s.annotate(neuron_name)
#annot_list = format_annots(neuron_name, r.annotations)
al = _cleanup(r.annotations, neuron_name)
if check_strain(neuron_name):
    al.append(check_strain(neuron_name))
print _normalize(al, shorten = True)

#print annot_list
print al

Isocortex CStr Npy Sst with depolarizing F S AHP L5a Pyr GIN GIN
[u'ABA_REGION:315', 'HBP_PROJECTION:7322', u'NCBI_GENE:109648', u'NCBI_GENE:20604', 'Missing:with depolarizing', u'HBP_EPHYS:0000080', u'HBP_EPHYS_TRIGGER:0000003', 'Missing:AHP', u'HBP_LAYER:0000050', u'HBP_MORPHOLOGY:0000001', 'Missing:GIN', 'MOUSE_LINE:GIN']


In [514]:
# use sherlok to annotate each curated neuron mention in neuroelectro
annot_vec = []
dict_list = []
for index, row in table.iterrows():
    neuron_name_orig = row['NeuronLongName']
    strain = unicode(row['Strain'])
    # if no neuron long name, use neurolex neuron name (should always be available)
    if pd.isnull(neuron_name_orig):
        neuron_name_orig = row['NeuronName']
        
    # hack for adding strain annotations, since not in neuroNER
    neuron_name = add_animal_strain_to_neuron(neuron_name_orig, strain)
    
    # annotate using sherlok
    r = s.annotate(neuron_name)
    cleaned_annotations = _cleanup(r.annotations, neuron_name)
    
    if check_strain(neuron_name):
        cleaned_annotations.append(check_strain(neuron_name))
        for c in cleaned_annotations:
            if c.startswith('NCBI_GENE'):
                cleaned_annotations.remove(c)
    short_annots = _normalize(cleaned_annotations, shorten = True)
    temp_dict = annots_to_dict(cleaned_annotations)
    temp_dict['neuron_name'] = neuron_name
    temp_dict['neuron_name_orig'] = neuron_name_orig
    temp_dict['short_name'] = short_annots
    temp_dict['annotations'] = cleaned_annotations
    temp_dict['Index'] = index
    dict_list.append(temp_dict)

#     f = format_annots(neuron_name, r.annotations)
#     annot_vec.append((neuron_name, f))

In [431]:
dict_list[4]

{'Index': 1093,
 'annotations': [u'UNKN_REGION:3442',
  'HBP_PROJECTION:22',
  u'HBP_MORPHOLOGY:0000001'],
 'neuron_name': 'secondary motor cortex corticospinal pyramidal cells',
 'neuron_name_orig': 'secondary motor cortex corticospinal pyramidal cells',
 'region': 'secondary motor cortex',
 'shape': 'Pyr',
 'short_name': 'secondary motor cortex CSp Pyr'}

In [515]:
col_names = [n[1] for n in name_spaces]
col_names.append('projection')

df = pd.DataFrame.from_records(dict_list, index = 'Index')
df

table[0:4]

Unnamed: 0_level_0,Title,Pmid,PubYear,LastAuthor,ArticleID,TableID,NeuronName,NeuronLongName,BrainRegion,MetadataCurated,...,ADP amplitude,access resistance,spike decay time,fast AHP duration,cell diameter,slow AHP duration,medium AHP amplitude,cell surface area,ADP duration,medium AHP duration
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1010,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,,,,,,,,,,
1011,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,,,,,,,,,,
1012,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,,,,,,,,,,
1018,Rac1 and rac3 GTPases control synergistically ...,23258346,2014,de Curtis I,88539,36200,Hippocampus CA3 pyramidal cell,,Hippocampal formation,True,...,,,,,,,,,,


In [516]:
table = table.join(df)

In [66]:
table.to_csv('df.csv', sep='\t', encoding='utf-8')

In [518]:
table.head()

Unnamed: 0_level_0,Title,Pmid,PubYear,LastAuthor,ArticleID,TableID,NeuronName,NeuronLongName,BrainRegion,MetadataCurated,...,annotations,ephys,gene,layer,line,neuron_name,neuron_name_orig,region,shape,short_name
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1010,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,"[ABA_REGION:1080, HBP_NEUROTRANSMITTER:0000004...",,,,,Hippocampus GluN2D-EGFP-positive interneuron,Hippocampus GluN2D-EGFP-positive interneuron,HIP,,HIP glu N2D-EGFP- positive
1011,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,"[ABA_REGION:1080, HBP_NEUROTRANSMITTER:0000004...",,,,,Hippocampus GluN2D-EGFP-positive interneuron,Hippocampus GluN2D-EGFP-positive interneuron,HIP,,HIP glu N2D-EGFP- positive
1012,GluN2D-containing NMDA receptors-mediate synap...,25859181,2015,Monyer H,85644,27502,Other,Hippocampus GluN2D-EGFP-positive interneuron,,True,...,"[ABA_REGION:1080, HBP_NEUROTRANSMITTER:0000004...",,,,,Hippocampus GluN2D-EGFP-positive interneuron,Hippocampus GluN2D-EGFP-positive interneuron,HIP,,HIP glu N2D-EGFP- positive
1018,Rac1 and rac3 GTPases control synergistically ...,23258346,2014,de Curtis I,88539,36200,Hippocampus CA3 pyramidal cell,,Hippocampal formation,True,...,"[ABA_REGION:463, HBP_MORPHOLOGY:0000001]",,,,,Hippocampus CA3 pyramidal cell,Hippocampus CA3 pyramidal cell,CA3,Pyr,CA3 Pyr
1093,Multiple layer 5 pyramidal cell subtypes relay...,23551921,2014,Kawaguchi Y,88453,35997,Neocortex pyramidal cell layer 5-6,secondary motor cortex corticospinal pyramidal...,Isocortex,True,...,"[UNKN_REGION:3442, HBP_PROJECTION:22, HBP_MORP...",,,,,secondary motor cortex corticospinal pyramidal...,secondary motor cortex corticospinal pyramidal...,secondary motor cortex,Pyr,secondary motor cortex CSp Pyr


In [520]:
# load in csv of Design.csv dataset

gene_exp_path = '/home/stripathy/brainCellTypeSpecificGenes/data/Design.tsv'
gene_exp_table = pd.read_csv(gene_exp_path, sep='\t', index_col = 0, header=0)


In [522]:
gene_exp_table.tail()

Unnamed: 0_level_0,MajorType,JustPyra,GabaDeep,PyramidalDeep,Neurotransmitter1,ShinyNames,ShinySubtypes,CellType,forContanim,Normalize2.0,...,Method,RNA.isolation.method,RNA.amplification.and.labeling.method,Platform,Reference,PMID,SharePermit,Notes,MouseStrain,ReporterGene
Cell.Type..Samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"GSM1698225,GSM1698226,GSM1698227",Neuron,,,,,,,,Pyramidal,T,...,TRAP,,,GPL1261,Shrestha et al. 2015,26371510,T,shreejoy wanted them,,
"GSM1698228,GSM16982289,GSM1698230",Neuron,,,,,,,,Pyramidal,T,...,TRAP,,,GPL1261,Shrestha et al. 2015,26371510,T,shreejoy wanted them,,
"GSM1698231,GSM1698232,GSM1698233",Neuron,,,,,,,,Gabaergic,T,...,TRAP,,,GPL1261,Shrestha et al. 2015,26371510,T,shreejoy wanted them,,
"GSM1698237,GSM1698238,GSM1698239",Glia,,,,,,,,Oligo,T,...,TRAP,,,GPL1261,Shrestha et al. 2015,26371510,T,shreejoy wanted them,,
"GSM1698234,GSM1698235,GSM1698236",Glia,,,,,,,,Astrocyte,T,...,TRAP,,,GPL1261,Shrestha et al. 2015,26371510,T,shreejoy wanted them,,


In [523]:
import re

# use sherlok to annotate each curated neuron mention in neuroelectro
#annot_vec = []
gene_dict_list = []
for index, row in gene_exp_table.iterrows():
    neuron_name = row['neuroNER Inferred Properties']
    
    # if no neuron long name, just move on
    if pd.isnull(neuron_name):
        temp_dict = {}
        temp_dict['annotations'] = []
        temp_dict['Index'] = index
        temp_dict['short_name'] = ''
        temp_dict['neuron_name'] = ''
        gene_dict_list.append(temp_dict)
        continue
        # neuron_name = row['NeuronName']
    
    mouse_line_match = re.search('from .+ mouse line', neuron_name)
    if mouse_line_match:
        strain_annotation = check_strain(neuron_name)
        neuron_name = re.sub('from .+ mouse line', '', neuron_name)
    
    # annotate using sherlok
    r = s.annotate(neuron_name)
    cleaned_annotations = _cleanup(r.annotations, neuron_name)
    if mouse_line_match:
        cleaned_annotations.append(strain_annotation)
        for c in cleaned_annotations:
            if c.startswith('NCBI_GENE'):
                cleaned_annotations.remove(c)
    short_annots = _normalize(cleaned_annotations, shorten = True)
    temp_dict = annots_to_dict(cleaned_annotations)
    temp_dict['short_name'] = short_annots
    temp_dict['neuron_name'] = neuron_name
    temp_dict['annotations'] = cleaned_annotations
    temp_dict['Index'] = index
    gene_dict_list.append(temp_dict)

#     f = format_annots(neuron_name, r.annotations)
#     annot_vec.append((neuron_name, f))

In [524]:
df_gene = pd.DataFrame.from_records(gene_dict_list, index = 'Index')
gene_exp_table = gene_exp_table.join(df_gene)

In [29]:
gene_exp_table.to_csv('df_gene.csv', sep='\t', encoding='utf-8')

In [525]:
for index, row in gene_exp_table.iterrows():
    curr_list = row
    sim_vec = []
    sim_val_vec = []
    for d in dict_list:
        sim = similarity2(curr_list['annotations'], d['annotations'], symmetric = False, use_inter_similarity = False, )
        sim_vec.append(sim)
        sim_val_vec.append(sim[0])
    sim_val_vec
    inds = sorted(range(len(sim_val_vec)),key=lambda x:sim_val_vec[x], reverse = True)
    print 'target name -> %s' % curr_list['short_name']
    #print curr_list['annotations']
    for i in range(0,20):
        print 'match value: %.2f, best match: %s' % (sim_val_vec[inds[i]], dict_list[inds[i]]['short_name'])
        #print dict_list[inds[i]]['annotations']
    print '\n'

target name -> Isocortex L5a Etv1 ST Pyr
match value: 0.68, best match: SSp-bfd Pyr L5a
match value: 0.68, best match: L5a SS barrel column small tuft Pyr
match value: 0.68, best match: L5a somatosensory SSp-bfd Pyr
match value: 0.68, best match: L5a somatosensory SSp-bfd Pyr
match value: 0.68, best match: L5a somatosensory SSp-bfd Pyr
match value: 0.68, best match: primary somatosensory SSp-bfd L5a Pyr
match value: 0.68, best match: somatosensory SSp-bfd L5a Pyr
match value: 0.65, best match: Isocortex L5 Pyr
match value: 0.65, best match: Isocortex L5 large Pyr
match value: 0.65, best match: L5 Isocortex Pyr
match value: 0.65, best match: Isocortex L5 Pyr
match value: 0.62, best match: ORB L5 large Pyr
match value: 0.62, best match: VIS L5 R S tall Pyr
match value: 0.62, best match: VIS L5 B tall Pyr
match value: 0.62, best match: VIS L5 R S short Pyr
match value: 0.62, best match: MOp L5 CThal Pyr
match value: 0.62, best match: MOp L5 CSp Pyr
match value: 0.62, best match: MOp L5 CC

In [526]:
from similarity import similarity, similarity2
from similarity_intra import load_ontologies

# Computes the intra and inter semantic similarity between two neurons
# in: n1@str, n2@str: the two neurons to measure similarity
# out: (score:float, [(matching_properties, explanation@str)])
#similarity('PV Layer 1 cortex acetylcholine neuron', 'Layer 1a striatum gaba fast-spiking neuron')
print similarity('dorsal raphe serotonergic cell', 'Midbrain Slc6a4-expressing serotonergic cells', symmetric = True)
print similarity('Midbrain Slc6a4-expressing serotonergic cells', 'dorsal raphe serotonergic cell', symmetric = False)

(0.7708333333333334, [[], (['ABA_REGION:313'], 'sharing a common brain region'), [], [], [], ([u'HBP_NEUROTRANSMITTER:0000001'], 'shares neurotransmitters'), [], []])
(0.6607142857142857, [[], (['ABA_REGION:313'], 'sharing a common brain region'), [], [], [], ([u'HBP_NEUROTRANSMITTER:0000001'], 'shares neurotransmitters'), [], []])


In [137]:
similarity2(gene_dict_list[25]['annotations'], dict_list[25]['annotations'])

(0.0, [[], [], [], [], [], [], [], []])

In [161]:
print dict_list[25]['short_name']
print gene_exp_table.iloc[51]['annotations']

SSs L5b CSp TT large Pyr
[u'ABA_REGION:453', u'NCBI_GENE:19293', u'HBP_EPHYS:0000080', u'HBP_EPHYS_TRIGGER:0000003', u'HBP_MORPHOLOGY:0000019', 'Missing:from G42', 'Species:mouse', 'Missing:line', 'MOUSE_LINE:G42']


In [527]:
from  more_itertools import unique_everseen

def find_matching_neurons(target_neuron_anns, matching_ann_dict_list, match_thresh = .8):
    sim_vec = []
    sim_val_vec = []
    matching_neuron_list = []
    for d in matching_ann_dict_list:
        sim = similarity2(target_neuron_anns['annotations'], d['annotations'], symmetric = False, use_inter_similarity = False)
        sim_vec.append(sim)
        sim_val_vec.append(sim[0])
    sim_val_vec
    inds = sorted(range(len(sim_val_vec)),key=lambda x:sim_val_vec[x], reverse = True)
    for i in range(0, len(inds)):
        if sim_val_vec[inds[i]] > match_thresh:
            matching_neuron_list.append(dict_list[inds[i]]['neuron_name_orig'])
    if (len(matching_neuron_list) <= 2):
        #print match_thresh, target_neuron_anns
        return find_matching_neurons(target_neuron_anns, matching_ann_dict_list, match_thresh - .05)
    elif match_thresh < .4:
        return []
    return list(unique_everseen(matching_neuron_list))

In [528]:
ind = 63
#print gene_exp_table.iloc[ind]['neuron_name']
find_matching_neurons(gene_exp_table.iloc[ind], dict_list)

['Layer 5 sensorimotor cortex pyramidal neurons',
 'Primary motor cortex layer 5 corticothalamic pyramidal neuron',
 'Primary motor cortex layer 5 corticospinal pyramidal neuron',
 'Primary motor cortex layer 5 corticocortical pyramidal neuron',
 'Primary motor cortex layer 5 corticostriatal pyramidal neuron',
 'Neocortex Layer V Pyramidal Cell',
 'neocortex layer 5 large pyramidal cell',
 'Layer 5 neocortex pyramidal neuron',
 'Neocortex layer 5 pyramidal neuron',
 'Prefrontal cortex layer 5 large pyramidal neuron',
 'visual cortex layer 5 regular spiking tall pyramidal neuron',
 'visual cortex layer 5 bursting tall pyramidal neuron',
 'visual cortex layer 5 regular spiking short pyramidal neuron',
 'Layer 5 frontal pyramidal cells projecting perirhinal cortex  ',
 'Regular spiking, Thy1-expressing layer V pyramidal neurons of somatosensory cortex ',
 'Somatosensory cortex layer 5 apical pyramidal cell',
 'prefrontal cortex layer 5 pyramidal neuron ',
 'Layer 5 Visual Cortex large thi

In [529]:
import numpy as np
# use sherlok to annotate each curated neuron mention in neuroelectro
#annot_vec = []
gene_dict_list = []
for index, row in gene_exp_table.iterrows():
    temp_dict = {}
    if len(row['annotations']) > 0:
        matched_neuron_list = find_matching_neurons(row, dict_list)
    else:
        matched_neuron_list = []
    temp_dict['matched_neuron_list'] = ';'.join(matched_neuron_list)
    temp_dict['Index'] = index
    gene_dict_list.append(temp_dict)

In [530]:
df_gene = pd.DataFrame.from_records(gene_dict_list, index = 'Index')
gene_exp_table_final = gene_exp_table.join(df_gene)

In [531]:
gene_exp_table_final.head()

Unnamed: 0_level_0,MajorType,JustPyra,GabaDeep,PyramidalDeep,Neurotransmitter1,ShinyNames,ShinySubtypes,CellType,forContanim,Normalize2.0,...,annotations,ephys,gene,layer,line,neuron_name,region,shape,short_name,matched_neuron_list
Cell.Type..Samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"H_lva_cortex:v2_GSM337817,v2_GSM337818,v2_GSM337819",,,,,,,,,Pyramidal,T,...,"[ABA_REGION:315, HBP_LAYER:0000050, NCBI_GENE:...",,Etv1,L5a,,Neocortex layer 5a Etv1-expressing slender-tuf...,Isocortex,ST Pyr,Isocortex L5a Etv1 ST Pyr,barrel cortex pyramidal neurons layer 5a;Layer...
"H_lvb_cortex:v2_GSM337821,v2_GSM337822,v2_GSM337823",,,,,,,,,Pyramidal,T,...,"[ABA_REGION:315, HBP_LAYER:0000051, NCBI_GENE:...",,Colgalt2,L5b,,Neocortex layer 5b Glt25d2-expressing thick-tu...,Isocortex,TT Pyr,Isocortex L5b Colgalt2 TT Pyr,Secondary somatosensory cortex layer 5b cortic...
"H_l6_cortex:v2_GSM337837,v2_GSM337838,v2_GSM337839",,,,,,,,,Pyramidal,T,...,"[ABA_REGION:315, HBP_LAYER:0000006, NCBI_GENE:...",,Ntsr1,L6,,Neocortex layer 6 Ntsr1-expressing pyramidal c...,Isocortex,Pyr,Isocortex L6 Ntsr1 Pyr,barrel cortex corticothalamic pyramidal neuron...
"H_cck+_cortex:v2_GSM337769,v2_GSM337770,v2_GSM337771",Neuron,,,,,,,,,F,...,"[ABA_REGION:315, NCBI_GENE:12424]",,Cck,,,Neocortex Cck-expressing cell,Isocortex,,Isocortex Cck,frontal cortex Layer 2/3 non-pyramidal cholecy...
"H_mn_brainstem:v2_GSM337773,v2_GSM337774,v2_GSM337775",Neuron,,MotorCholin,MotorCholin,Cholinergic,MotorCholin,,MotorCholin,MotorCholin,T,...,"[ABA_REGION:343, NCBI_GENE:12647, HBP_NEUROTRA...",,Chat,,,Brainstem Chat-expressing cholinergic motor ne...,BS,,BS Chat ACh motor,


In [532]:
# work on sorting gene exp rows by semantic similarity

from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import leaders, fclusterdata, linkage, dendrogram, leaves_list

# calculate pairwise distance matrix based on similarity
dist_mat = []
labels = []
for index, row1 in gene_exp_table_final.iterrows():
    sim_vec = []
    ann1 = row1['annotations']
    labels.append(row1['short_name'])
    for i, row2 in gene_exp_table.iterrows():
        ann2 = row2['annotations']
        
        (sim_val, exp) = similarity2(ann1, ann2, symmetric = True, use_inter_similarity = False)
        sim_dist = 1 - sim_val
        sim_vec.append(sim_dist)
    dist_mat.append(sim_vec)
    
# 
d = squareform(dist_mat)
Z = linkage(d)
sorted_rows = leaves_list(Z)

In [533]:
gene_exp_table_final = gene_exp_table_final.iloc[sorted_rows]
gene_exp_table_final.to_csv('df_gene.csv', sep='\t', encoding='utf-8')

In [352]:
dendrogram(Z, labels = labels)


array([ 82,  15,  16,  17,  18,  24,  25,  26,  27,  28,  29,  30,  31,
        56,  57,  58,  59,  60,  61,  70,  72,  79,  80,  81,  83,  84,
        85,  86,  87,  88,  89,  90,  91, 104, 105, 111, 119, 120, 121,
       122,  50,  49,  55, 124,  51,  52,  53,  54, 106, 107, 108, 109,
       110,  69,  92,  93,  94,  95,  96,  97,  98,  13,  11,  12,  14,
        21,  22,  62,  19,  99, 100, 101, 126,  47,  48,  71,  42,  43,
        44,  45,  46,   0, 102,  38, 118,  35,  36,  37, 115, 116, 117,
        32,  33,  34,   1,  78,  40,  41,  67, 123,  63,  64,   3,   8,
        20, 113,  23, 114,   2,  39,  10,  76,  77,   9,  73,  74,  75,
       103, 128, 129,  65,  66,  68, 125, 127, 112,   4,   7,   5,   6], dtype=int32)