In [1]:
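# The node types in this file (presumably nodes_sanitized.csv, loaded below) are bad.
# Get each concept's semantic types from the UMLS mapping file: MRSTY.RRF
# and the semantic type ID -> name table from:
#   https://metamap.nlm.nih.gov/Docs/SemanticTypes_2013AA.txt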

In [51]:
import ast
import os
import pickle
import re
import shelve
from collections import Counter, defaultdict
from itertools import chain

import pandas as pd
import requests
import seaborn as sns
from tqdm import tqdm

%matplotlib inline

In [19]:
# Pipe-delimited semantic type table with no header row:
# abbreviation | semantic type ID | human-readable label.
SEMANTIC_TYPES_URL = "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2013AA.txt"
names = pd.read_csv(SEMANTIC_TYPES_URL, sep="|", names=['abv', 'ID', 'label'])
names.head()

Unnamed: 0,abv,ID,label
0,aapp,T116,"Amino Acid, Peptide, or Protein"
1,acab,T020,Acquired Abnormality
2,acty,T052,Activity
3,aggp,T100,Age Group
4,amas,T087,Amino Acid Sequence


In [40]:
# Lookup table: semantic type ID (e.g. 'T116') -> label.
type_label = {type_id: label for type_id, label in zip(names.ID, names.label)}

In [13]:
# MRSTY.RRF holds the UMLS concept -> semantic type assignments. It is pipe-delimited
# with no header row; the documented field order is CUI|TUI|STN|STY|ATUI|CVF.
# CUI and TUI keep the names 'ID' and 'TYPE' because the cells below refer to them
# that way; the remaining columns use their UMLS field names.
# index_col=False stops pandas from treating the first column as the index, which it
# would otherwise do when rows carry a trailing delimiter (as RRF rows normally do).
df = pd.read_csv(
    "MRSTY.RRF",
    sep="|",
    names=['ID', 'TYPE', 'STN', 'STY', 'ATUI', 'CVF'],
    index_col=False,
)

In [30]:
# Collapse the per-row assignments into one set of semantic type IDs per concept ID.
id_type = df.groupby("ID")["TYPE"].apply(set)

In [34]:
# Turn the concept ID -> set-of-types mapping into a plain dict for .get() lookups.
# Building it from .items() works whether id_type is still the grouped Series or is
# already a dict, so re-running this cell no longer fails on the rebound name.
id_type = dict(id_type.items())
# Spot-check a single concept.
id_type['C0000005']

{'T116', 'T121', 'T130'}

In [37]:
# Load the node table; the first CSV column is used as the row index.
nodes = pd.read_csv(filepath_or_buffer="nodes_sanitized.csv", index_col=0)
nodes.head()

Unnamed: 0,ID,label,TYPE
0,C0038195,State Medicine,Activities & Behaviors
1,C1290952,Taking medication,Activities & Behaviors
2,C0680552,legalization,Activities & Behaviors
3,C0085092,Parenting behavior,Activities & Behaviors
4,C1096771,Murderer,Activities & Behaviors


In [42]:
# Attach the set of UMLS semantic type IDs to every node. Nodes whose ID is missing
# from id_type get an empty *set* (the literal {} would be an empty dict, leaving
# the column with mixed dict/set values).
nodes['umls_type'] = nodes.ID.map(lambda concept_id: id_type.get(concept_id, set()))
# Translate each type ID into its label; an ID absent from type_label maps to None.
nodes['umls_type_label'] = nodes.umls_type.map(
    lambda type_ids: {type_label.get(type_id) for type_id in type_ids}
)

In [44]:
# Preview the first rows with the two new UMLS columns.
nodes.head(n=5)

Unnamed: 0,ID,label,TYPE,umls_type,umls_type_label
0,C0038195,State Medicine,Activities & Behaviors,{T064},{Governmental or Regulatory Activity}
1,C1290952,Taking medication,Activities & Behaviors,{T056},{Daily or Recreational Activity}
2,C0680552,legalization,Activities & Behaviors,{T064},{Governmental or Regulatory Activity}
3,C0085092,Parenting behavior,Activities & Behaviors,{T054},{Social Behavior}
4,C1096771,Murderer,Activities & Behaviors,{T055},{Individual Behavior}


In [46]:
# Look up a single node by its ID to inspect a concept with more than one type.
nodes[nodes.ID == 'C0079904']

Unnamed: 0,ID,label,TYPE,umls_type,umls_type_label
34904,C0079904,NF-kappa B,Chemicals & Drugs,"{T129, T116}","{Amino Acid, Peptide, or Protein, Immunologic ..."


In [49]:
# NOTE: this writes back to the same file the nodes were read from above, so the
# file on disk now also contains the umls_type / umls_type_label columns.
nodes.to_csv(path_or_buf="nodes_sanitized.csv")

In [91]:
def _parse_set_cell(text):
    """Turn a CSV cell holding the repr of a set back into a set.

    Uses ast.literal_eval instead of eval so that the contents of the CSV file
    can never be executed as code.

    Parameters
    ----------
    text : str
        Raw cell text, e.g. "{'T116', 'T129'}", "set()" or "{}".

    Returns
    -------
    set
        The parsed members. "set()", "{}" and an empty cell all yield an empty set.

    Raises
    ------
    ValueError, SyntaxError
        If the cell is not a Python literal.
    """
    text = text.strip()
    # repr(set()) is "set()", which literal_eval only accepts on newer Pythons.
    if text in ("", "set()"):
        return set()
    # set(...) also normalises "{}" (an empty dict literal) to an empty set.
    return set(ast.literal_eval(text))


nodes = pd.read_csv(
    "nodes_sanitized.csv",
    index_col=0,
    converters={'umls_type': _parse_set_cell, 'umls_type_label': _parse_set_cell},
)
nodes.head()

Unnamed: 0,ID,label,TYPE,umls_type,umls_type_label
0,C0038195,State Medicine,Activities & Behaviors,{T064},{Governmental or Regulatory Activity}
1,C1290952,Taking medication,Activities & Behaviors,{T056},{Daily or Recreational Activity}
2,C0680552,legalization,Activities & Behaviors,{T064},{Governmental or Regulatory Activity}
3,C0085092,Parenting behavior,Activities & Behaviors,{T054},{Social Behavior}
4,C1096771,Murderer,Activities & Behaviors,{T055},{Individual Behavior}


In [92]:
# Total number of nodes (rows).
nodes.shape[0]

268917

In [93]:
# Number of nodes with no UMLS type: count the rows whose umls_type is empty.
has_no_umls_type = nodes.umls_type.map(len) == 0
int(has_no_umls_type.sum())

9773

In [94]:
# The 30 most frequent UMLS type labels across all nodes (a node with several
# labels counts once for each of them).
label_counts = Counter(chain.from_iterable(nodes.umls_type_label))
label_counts.most_common(30)

[('Organic Chemical', 53109),
 ('Pharmacologic Substance', 36750),
 ('Amino Acid, Peptide, or Protein', 29930),
 ('Gene or Genome', 18932),
 ('Disease or Syndrome', 16582),
 ('Eukaryote', 11772),
 ('Plant', 11265),
 ('Biologically Active Substance', 11195),
 ('Enzyme', 8922),
 ('Finding', 8702),
 ('Therapeutic or Preventive Procedure', 8114),
 ('Body Part, Organ, or Organ Component', 8114),
 ('Bacterium', 6016),
 ('Neoplastic Process', 5049),
 ('Medical Device', 4768),
 ('Immunologic Factor', 4384),
 ('Fungus', 3503),
 ('Laboratory Procedure', 3494),
 ('Molecular Function', 3260),
 ('Indicator, Reagent, or Diagnostic Aid', 3062),
 ('Mammal', 3056),
 ('Fish', 2888),
 ('Nucleic Acid, Nucleoside, or Nucleotide', 2804),
 ('Virus', 2523),
 ('Clinical Drug', 2498),
 ('Pathologic Function', 2423),
 ('Diagnostic Procedure', 2382),
 ('Hazardous or Poisonous Substance', 2288),
 ('Injury or Poisoning', 2104),
 ('Inorganic Chemical', 2084)]

In [95]:
# What is the current TYPE of the nodes whose UMLS type labels include
# "Amino Acid, Peptide, or Protein"?
is_amino_acid = nodes.umls_type_label.map(lambda labels: "Amino Acid, Peptide, or Protein" in labels)
nodes.loc[is_amino_acid, "TYPE"].value_counts()

Chemicals & Drugs    29930
Name: TYPE, dtype: int64

In [96]:
# What is the current TYPE of the nodes whose UMLS type labels include
# "Nucleic Acid, Nucleoside, or Nucleotide"?
is_nucleic_acid = nodes.umls_type_label.map(lambda labels: "Nucleic Acid, Nucleoside, or Nucleotide" in labels)
nodes.loc[is_nucleic_acid, "TYPE"].value_counts()

Chemicals & Drugs    2804
Name: TYPE, dtype: int64

In [97]:
# Every node carrying the "Amino Acid, Peptide, or Protein" label gets TYPE "protein".
is_amino_acid = nodes.umls_type_label.map(lambda labels: "Amino Acid, Peptide, or Protein" in labels)
nodes.loc[is_amino_acid, "TYPE"] = "protein"

In [98]:
# Nodes labelled "Nucleic Acid, Nucleoside, or Nucleotide" and NOT also
# "Pharmacologic Substance" get TYPE "genomic_entity".
# NOTE: this assignment overwrites whatever TYPE the node had before, so a node
# already relabelled "protein" that also matches here ends up as "genomic_entity".
has_nucleic_acid = nodes.umls_type_label.map(lambda labels: "Nucleic Acid, Nucleoside, or Nucleotide" in labels)
has_pharmacologic = nodes.umls_type_label.map(lambda labels: "Pharmacologic Substance" in labels)
nodes.loc[has_nucleic_acid & ~has_pharmacologic, "TYPE"] = "genomic_entity"

In [100]:
# Re-check the TYPE distribution of the amino-acid/protein nodes after relabelling.
is_amino_acid = nodes.umls_type_label.map(lambda labels: "Amino Acid, Peptide, or Protein" in labels)
nodes.loc[is_amino_acid, "TYPE"].value_counts()

protein           29865
genomic_entity       65
Name: TYPE, dtype: int64

In [101]:
# Re-check the TYPE distribution of the nucleic-acid nodes after relabelling.
is_nucleic_acid = nodes.umls_type_label.map(lambda labels: "Nucleic Acid, Nucleoside, or Nucleotide" in labels)
nodes.loc[is_nucleic_acid, "TYPE"].value_counts()

genomic_entity       1906
Chemicals & Drugs     888
protein                10
Name: TYPE, dtype: int64

In [102]:
# The helper UMLS columns were only needed to derive TYPE; drop them before saving.
nodes = nodes.drop(['umls_type_label', 'umls_type'], axis=1)

In [103]:
# Save the node table with the corrected TYPE column.
nodes.to_csv(path_or_buf="nodes_fix_types.csv")