In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import shelve
from itertools import chain
import re
from collections import defaultdict, Counter
from tqdm import tqdm
import requests

In [2]:
pd.options.display.max_colwidth = 80

In [3]:
groups = pd.read_csv("https://metamap.nlm.nih.gov/Docs/SemGroups_2013.txt", sep="|",
                    names=['abv', 'group', 'id', 'label'])
groups.head()

Unnamed: 0,abv,group,id,label
0,ACTI,Activities & Behaviors,T052,Activity
1,ACTI,Activities & Behaviors,T053,Behavior
2,ACTI,Activities & Behaviors,T056,Daily or Recreational Activity
3,ACTI,Activities & Behaviors,T051,Event
4,ACTI,Activities & Behaviors,T064,Governmental or Regulatory Activity


In [4]:
print(sorted(list(groups.query("group == 'Disorders'").label)))

['Acquired Abnormality', 'Anatomical Abnormality', 'Cell or Molecular Dysfunction', 'Congenital Abnormality', 'Disease or Syndrome', 'Experimental Model of Disease', 'Finding', 'Injury or Poisoning', 'Mental or Behavioral Dysfunction', 'Neoplastic Process', 'Pathologic Function', 'Sign or Symptom']


In [5]:
names = pd.read_csv("https://metamap.nlm.nih.gov/Docs/SemanticTypes_2013AA.txt", sep="|", 
                   names=['abv', 'ID', 'label'])
names.head()

Unnamed: 0,abv,ID,label
0,aapp,T116,"Amino Acid, Peptide, or Protein"
1,acab,T020,Acquired Abnormality
2,acty,T052,Activity
3,aggp,T100,Age Group
4,amas,T087,Amino Acid Sequence


In [6]:
type_label = dict(zip(names.ID, names.label))

In [7]:
df = pd.read_csv("MRSTY.RRF", sep="|", names=['ID', 'TYPE', 'who', 'the', 'fuck', 'knows'], index_col=False)

In [8]:
id_type = df.groupby("ID").TYPE.apply(set)

In [9]:
id_type = id_type.to_dict()
id_type['C0000005']

{'T116', 'T121', 'T130'}

In [10]:
id_type_label = {k:{type_label.get(x) for x in v} for k,v in id_type.items()}
id_type_label['C0000005']

{'Amino Acid, Peptide, or Protein',
 'Indicator, Reagent, or Diagnostic Aid',
 'Pharmacologic Substance'}

In [11]:
nodes = pd.read_csv("nodes_sanitized.csv", index_col=0)
del nodes['TYPE']
nodes.head()

Unnamed: 0,ID,label,umls_type,umls_type_label
0,C0038195,State Medicine,{'T064'},{'Governmental or Regulatory Activity'}
1,C1290952,Taking medication,{'T056'},{'Daily or Recreational Activity'}
2,C0680552,legalization,{'T064'},{'Governmental or Regulatory Activity'}
3,C0085092,Parenting behavior,{'T054'},{'Social Behavior'}
4,C1096771,Murderer,{'T055'},{'Individual Behavior'}


In [12]:
nodes['umls_type'] = nodes.ID.map(lambda x: id_type.get(x))
nodes['umls_type_label'] = nodes.umls_type.map(lambda x:{type_label.get(y) for y in x} if x else None)

In [13]:
nodes.head()

Unnamed: 0,ID,label,umls_type,umls_type_label
0,C0038195,State Medicine,{T064},{Governmental or Regulatory Activity}
1,C1290952,Taking medication,{T056},{Daily or Recreational Activity}
2,C0680552,legalization,{T064},{Governmental or Regulatory Activity}
3,C0085092,Parenting behavior,{T054},{Social Behavior}
4,C1096771,Murderer,{T055},{Individual Behavior}


In [14]:
# toss those with no umls types
print(len(nodes))
nodes.dropna(subset=['umls_type'], inplace=True)
print(len(nodes))

268917
259144


In [15]:
blm_to_umls = {
    'cell': {'cell'},
    'cell_component': {'cell component'},
    'gross_anatomical_structure': {'Body System', 'Embryonic Structure', 'Fully Formed Anatomical Structure',
                                  'Tissue', 'Body Part, Organ, or Organ Component'},
    'anatomical_entity': {'Body Location or Region', 'Body Space or Junction', 'Body Substance'},
    'activity_and_behavior': {'Activity', 'Behavior', 'Daily or Recreational Activity' , 'Individual Behavior',
                             'Social Behavior', 'Occupational Activity'},
    'protein': {'Amino Acid, Peptide, or Protein', 'Enzyme'},
    'genomic_entity': {'Nucleic Acid, Nucleoside, or Nucleotide'},
    # this is everything in 'Chemicals & Drugs' minus 'Amino Acid, Peptide, or Protein', 
    # 'Enzyme', 'Nucleic Acid, Nucleoside, or Nucleotide'
    'chemical_substance': {'Chemical Viewed Structurally', 'Organic Chemical', 'Carbohydrate', 
                           'Immunologic Factor', 'Lipid', 'Element, Ion, or Isotope', 'Organophosphorus Compound', 
                           'Biologically Active Substance', 'Pharmacologic Substance', 'Hormone', 
                           'Eicosanoid', 'Vitamin', 'Chemical Viewed Functionally', 'Hazardous or Poisonous Substance', 
                           'Inorganic Chemical', 'Clinical Drug', 'Neuroreactive Substance or Biogenic Amine', 
                           'Biomedical or Dental Material', 'Antibiotic', 'Chemical', 'Steroid', 
                           'Indicator, Reagent, or Diagnostic Aid', 'Receptor'},
    # this is everything under 'Disorders' umls semantic group
    'disease_or_phenotypic_feature': {'Acquired Abnormality', 'Anatomical Abnormality', 
                                      'Cell or Molecular Dysfunction', 'Congenital Abnormality', 
                                      'Disease or Syndrome', 'Experimental Model of Disease', 'Finding', 
                                      'Injury or Poisoning', 'Mental or Behavioral Dysfunction', 
                                      'Neoplastic Process', 'Pathologic Function', 'Sign or Symptom'},
    'gene': {'Gene or Genome'},
    'genomic_entity': {'Amino Acid Sequence', 'Nucleotide Sequence'},
    'biological_process_or_activity': {'Biologic Function', 'Molecular Function', 'Genetic Function',
                                      'Organism Function', 'Organ or Tissue Function', 'Cell Function',
                                      'Mental Process'},
    'phenotypic_feature': {'Laboratory or Test Result'},
    'individual_organism': {'Living Beings'},
    'named_thing': {'Objects'}
}
umls_to_blm_check = defaultdict(set)
umls_to_blm = dict()
for k,vv in blm_to_umls.items():
    for v in vv:
        umls_to_blm_check[v.lower()].add(k.lower())
        umls_to_blm[v.lower()] = k.lower()
assert set(len(x) for x in umls_to_blm_check.values()) == {1}
# umls_to_blm

In [16]:
nodes['blm_type'] = nodes.umls_type_label.map(lambda x: {umls_to_blm.get(xx.lower()) for xx in x})
nodes.blm_type = nodes.blm_type.map(lambda v: {x for x in v if x})
nodes.blm_type = nodes.blm_type.map(lambda v: v if v else pd.np.NAN)
nodes.dropna(subset=['blm_type'], inplace=True)
nodes.head()

Unnamed: 0,ID,label,umls_type,umls_type_label,blm_type
1,C1290952,Taking medication,{T056},{Daily or Recreational Activity},{activity_and_behavior}
3,C0085092,Parenting behavior,{T054},{Social Behavior},{activity_and_behavior}
4,C1096771,Murderer,{T055},{Individual Behavior},{activity_and_behavior}
6,C0006875,Cannibalism,{T054},{Social Behavior},{activity_and_behavior}
7,C0871454,Study Habits,{T055},{Individual Behavior},{activity_and_behavior}


In [17]:
nodes[nodes.ID.isin({"C0079904", "C0879593", "C0006875"})]

Unnamed: 0,ID,label,umls_type,umls_type_label,blm_type
6,C0006875,Cannibalism,{T054},{Social Behavior},{activity_and_behavior}
22299,C0879593,autologous dendritic cells,"{T121, T025}","{Cell, Pharmacologic Substance}","{chemical_substance, cell}"
34904,C0079904,NF-kappa B,"{T116, T129}","{Immunologic Factor, Amino Acid, Peptide, or Protein}","{chemical_substance, protein}"


In [18]:
# the only nodes with two types:
nodes[nodes.blm_type.map(len)>1].blm_type.map(frozenset).value_counts()

(chemical_substance, protein)           16117
(chemical_substance, cell)                  3
(chemical_substance, genomic_entity)        1
Name: blm_type, dtype: int64

In [19]:
# within the nodes with more than 1 blm types:
# what are the most common umls types
nodes2 = nodes[nodes.blm_type.map(len)>1]
Counter(nodes2[nodes2.umls_type_label.map(len)>1].umls_type_label.map(frozenset)).most_common(10)

[(frozenset({'Amino Acid, Peptide, or Protein',
             'Biologically Active Substance'}),
  7375),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Pharmacologic Substance'}),
  2568),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Immunologic Factor'}), 2156),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Receptor'}), 1367),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Immunologic Factor',
             'Pharmacologic Substance'}),
  536),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Enzyme',
             'Pharmacologic Substance'}),
  269),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Hormone',
             'Pharmacologic Substance'}),
  264),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Biologically Active Substance',
             'Hazardous or Poisonous Substance'}),
  254),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Antibiotic'}), 216),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Hormone'}), 1

In [20]:
nodes.loc[nodes.blm_type == {'protein', 'chemical_substance'}, "blm_type"] = {"biological_entity"}
nodes.loc[nodes.blm_type == {'cell', 'chemical_substance'}, "blm_type"] = {"biological_entity"}
nodes.loc[nodes.blm_type == {'genomic_entity', 'chemical_substance'}, "blm_type"] = {"biological_entity"}
nodes.blm_type = nodes.blm_type.map(lambda x:list(x)[0])

In [21]:
nodes[nodes.ID.isin({"C0079904", "C0879593", "C0006875"})]

Unnamed: 0,ID,label,umls_type,umls_type_label,blm_type
6,C0006875,Cannibalism,{T054},{Social Behavior},activity_and_behavior
22299,C0879593,autologous dendritic cells,"{T121, T025}","{Cell, Pharmacologic Substance}",biological_entity
34904,C0079904,NF-kappa B,"{T116, T129}","{Immunologic Factor, Amino Acid, Peptide, or Protein}",biological_entity


In [22]:
nodes[nodes.blm_type=="disease_or_phenotypic_feature"].umls_type_label.map(frozenset).value_counts()

(Disease or Syndrome)                                  16445
(Finding)                                               8702
(Neoplastic Process)                                    5024
(Pathologic Function)                                   2423
(Injury or Poisoning)                                   2104
(Sign or Symptom)                                       1979
(Congenital Abnormality)                                1720
(Mental or Behavioral Dysfunction)                      1408
(Acquired Abnormality)                                   658
(Anatomical Abnormality)                                 612
(Cell or Molecular Dysfunction)                          410
(Congenital Abnormality, Disease or Syndrome)            124
(Experimental Model of Disease)                           36
(Neoplastic Process, Experimental Model of Disease)       25
(Anatomical Abnormality, Disease or Syndrome)             13
Name: umls_type_label, dtype: int64

In [23]:
nodes.blm_type.value_counts()

chemical_substance                65061
disease_or_phenotypic_feature     41683
gene                              18932
biological_entity                 16121
protein                           13813
gross_anatomical_structure         9260
biological_process_or_activity     7446
anatomical_entity                  2896
cell_component                     1685
activity_and_behavior              1281
cell                               1241
phenotypic_feature                  550
genomic_entity                      182
Name: blm_type, dtype: int64

In [24]:
nodes.umls_type_label = nodes.umls_type_label.apply("|".join)
nodes.umls_type = nodes.umls_type.apply("|".join)

In [25]:
nodes[nodes.ID.isin({"C0079904", "C0879593", "C0006875", "C0815111"})]

Unnamed: 0,ID,label,umls_type,umls_type_label,blm_type
6,C0006875,Cannibalism,T054,Social Behavior,activity_and_behavior
22299,C0879593,autologous dendritic cells,T121|T025,Cell|Pharmacologic Substance,biological_entity
34904,C0079904,NF-kappa B,T116|T129,"Immunologic Factor|Amino Acid, Peptide, or Protein",biological_entity


In [27]:
nodes.to_csv("nodes_blm.csv")