# Development notebook for creating a MeSH NXOntology via SPARQL queries

In [1]:
# type: ignore
# autoreload mode 2 re-imports all modules before each cell executes, so edits to
# the imported project code take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd

from nxontology_data.mesh.mesh import MeshLoader
# Optional imports: only needed if the commented-out graphviz rendering in the
# "subclass graph" section is re-enabled.
# from IPython.display import Image
# from networkx.drawing.nx_agraph import to_agraph

## load RDF

In [2]:
%%time
# MeSH release year to load; also passed to create_nxo further down.
mesh_year = "2022"
# Slow step: the recorded run of this cell took roughly 10 minutes of wall time.
rdf = MeshLoader.get_mesh_rdf(mesh_year)

CPU times: user 9min 38s, sys: 10.8 s, total: 9min 49s
Wall time: 10min 3s


## subclass graph

In [3]:
# Result of MeshLoader.create_vocab_digraph on the loaded RDF. Within this notebook
# `vocab` is only referenced by the optional rendering below.
vocab = MeshLoader.create_vocab_digraph(rdf)
# Optional graphviz rendering; requires the commented-out imports
# (Image, to_agraph) in the first cell.
# gviz = to_agraph(vocab)
# gviz.layout("dot")
# Image(gviz.draw(format="png"))

## identifiers

In [4]:
%%time
# Run the "identifiers" query against the RDF graph.
# Slow step: the recorded run of this cell took roughly 17 minutes.
# NOTE(review): cache=True presumably stores/reuses the query result — confirm in
# MeshLoader.run_query.
id_df = MeshLoader.run_query(rdf, "identifiers", cache=True)
# Preview the first two identifier records.
id_df.head(2)

CPU times: user 17min 23s, sys: 4.46 s, total: 17min 27s
Wall time: 17min 28s


Unnamed: 0,mesh_id,mesh_class,mesh_uri,mesh_label,mesh_date_created,mesh_date_revised,mesh_date_established,mesh_frequency,mesh_description
0,C000002,SCR_Chemical,http://id.nlm.nih.gov/mesh/2022/C000002,bevonium,1971-01-01,2018-09-24,,1.0,structure given in first source
1,C000006,SCR_Chemical,http://id.nlm.nih.gov/mesh/2022/C000006,"insulin, neutral",1971-01-01,2017-10-04,,263.0,"a neutral, buffered solution of pork insulin"


In [5]:
# number of identifier records per MeSH class
id_df["mesh_class"].value_counts()

Term                      843575
Concept                   457296
SCR_Chemical              246750
SCR_Organism               65233
TopicalDescriptor          29607
SCR_Disease                 6540
SCR_Protocol                1217
GeographicalDescriptor       398
PublicationType              187
Qualifier                     76
CheckTag                       2
Name: mesh_class, dtype: int64

In [6]:
# count of identifier records that have no label
# to inspect the offending rows instead: id_df[id_df.mesh_label.isna()]
id_df["mesh_label"].isna().sum()

0

In [7]:
# does any mesh_id occur on more than one row?
# to inspect the offending rows instead: id_df[id_df.mesh_id.duplicated(keep=False)]
id_df["mesh_id"].duplicated().any()

False

In [8]:
# preview the first two records whose class is Qualifier
id_df[id_df["mesh_class"] == "Qualifier"].head(2)

Unnamed: 0,mesh_id,mesh_class,mesh_uri,mesh_label,mesh_date_created,mesh_date_revised,mesh_date_established,mesh_frequency,mesh_description
807230,Q000000981,Qualifier,http://id.nlm.nih.gov/mesh/2022/Q000000981,diagnostic imaging,2016-06-29,2016-06-08,2017-01-01,,subheading only; coordinate with specific ima...
807231,Q000002,Qualifier,http://id.nlm.nih.gov/mesh/2022/Q000002,abnormalities,1973-12-27,2015-07-01,1966-01-01,,subhead only; congenital & structural only: do...


## tree numbers

In [9]:
%%time
# Run the "tree-numbers" query against the RDF graph.
# NOTE(review): cache=True presumably stores/reuses the query result — confirm in
# MeshLoader.run_query.
tree_number_df = MeshLoader.run_query(rdf, "tree-numbers", cache=True)
# The recorded preview shows one row per (mesh_id, tree_number) pair, so a single
# mesh_id can span several rows.
tree_number_df.head(4)

CPU times: user 18.8 s, sys: 4.02 ms, total: 18.8 s
Wall time: 18.8 s


Unnamed: 0,mesh_id,tree_number
0,D000001,D03.633.100.221.173
1,D000002,D02.705.400.625.800
2,D000002,D02.705.539.345.800
3,D000002,D02.886.300.692.800


## load into networkx

Currently, the ontology excludes qualifiers and descriptor-qualifier pairs. Future work should determine how to handle these instances.

In [None]:
%%time
# Create the ontology object from the RDF graph for the MeSH year chosen above.
nxo = MeshLoader.create_nxo(rdf, year_yyyy=mesh_year)
# Display (node count, edge count) of the resulting graph.
nxo.n_nodes, nxo.graph.number_of_edges()

In [None]:
# flag each identifier record by whether its mesh_id is a node of the ontology graph
nxo_node_ids = set(nxo.graph)
id_df["in_nxo"] = id_df["mesh_id"].isin(nxo_node_ids)
id_df.head(2)

Unnamed: 0,mesh_id,mesh_class,mesh_uri,mesh_label,in_nxo
0,C000002,SCR_Chemical,http://id.nlm.nih.gov/mesh/2021/C000002,bevonium,True
1,C000006,SCR_Chemical,http://id.nlm.nih.gov/mesh/2021/C000006,"insulin, neutral",True


In [None]:
# records per MeSH class, split by membership in the ontology graph
pd.crosstab(index=id_df["mesh_class"], columns=id_df["in_nxo"]).reset_index()

in_nxo,mesh_class,False,True
0,CheckTag,0,2
1,Concept,454398,0
2,GeographicalDescriptor,0,397
3,PublicationType,0,187
4,Qualifier,76,0
5,SCR_Chemical,0,245164
6,SCR_Disease,0,6526
7,SCR_Organism,0,65256
8,SCR_Protocol,0,1217
9,Term,835307,0


In [None]:
# Sanity check: every node of the ontology graph must have a row in id_df.
# Nodes lacking identifier metadata (e.g. descriptor-qualifier pairs) would be
# reported here; the message lists a few of them to make a failure actionable.
nodes_missing_from_id_df = set(nxo.graph) - set(id_df.mesh_id)
assert not nodes_missing_from_id_df, (
    f"{len(nodes_missing_from_id_df)} graph node(s) missing from id_df, "
    f"e.g. {sorted(map(str, nodes_missing_from_id_df))[:5]}"
)

In [None]:
%%time
# Build the node -> top-level term table from the ontology (the recorded run took
# roughly 30 seconds). The preview shows the columns it provides, including
# top_mesh_id, top_is_disease and depth.
top_map_df = MeshLoader.create_top_level_map_df(nxo)
top_map_df.head()

CPU times: user 29.3 s, sys: 172 ms, total: 29.5 s
Wall time: 29.5 s


Unnamed: 0,mesh_id,mesh_label,mesh_class,top_mesh_id,top_tree_number,top_mesh_label,top_is_disease,depth
466639,D001829,Body Regions,TopicalDescriptor,D001829,A01,Body Regions,False,0
466849,D001940,Breast,TopicalDescriptor,D001829,A01,Body Regions,False,1
472065,D005121,Extremities,TopicalDescriptor,D001829,A01,Body Regions,False,1
473889,D006257,Head,TopicalDescriptor,D001829,A01,Body Regions,False,1
478907,D009333,Neck,TopicalDescriptor,D001829,A01,Body Regions,False,1


In [None]:
# top level terms and the number of descendants by class

# One metadata row per top-level term.
top_term_df = top_map_df[
    ["top_mesh_id", "top_tree_number", "top_mesh_label", "top_is_disease"]
].drop_duplicates()

# Count rows per (top-level term, class) and spread the classes into columns.
# unstack reshapes the counts directly, so they stay integers and absent pairs
# become 0. The previous pivot_table call implicitly averaged the counts
# (default aggfunc="mean"), which only worked because each pair occurs once.
class_count_df = (
    top_map_df
    .groupby(["top_mesh_id", "mesh_class"])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

# Join on the term id explicitly instead of relying on whichever column names the
# two frames happen to share; class_count_df has exactly one row per top_mesh_id.
top_summary_df = top_term_df.merge(
    class_count_df, on="top_mesh_id", validate="many_to_one"
)
top_summary_df.head(2)

Unnamed: 0,top_mesh_id,top_tree_number,top_mesh_label,top_is_disease,GeographicalDescriptor,SCR_Chemical,SCR_Disease,SCR_Organism,SCR_Protocol,TopicalDescriptor
0,D001829,A01,Body Regions,False,0,1,7,0,0,266
1,D009141,A02,Musculoskeletal System,False,0,4,5,0,0,242


### disease subset

In [None]:
# keep only the disease top-level terms and hide three of the class count columns
top_summary_df.loc[top_summary_df["top_is_disease"]].drop(
    columns=["GeographicalDescriptor", "SCR_Organism", "SCR_Protocol"]
)

Unnamed: 0,top_mesh_id,top_tree_number,top_mesh_label,top_is_disease,SCR_Chemical,SCR_Disease,TopicalDescriptor
26,D007239,C01,Infections,True,1,119,798
27,D009369,C04,Neoplasms,True,0,401,704
28,D009140,C05,Musculoskeletal Diseases,True,0,1750,392
29,D004066,C06,Digestive System Diseases,True,0,302,321
30,D009057,C07,Stomatognathic Diseases,True,0,354,229
31,D012140,C08,Respiratory Tract Diseases,True,1,162,247
32,D010038,C09,Otorhinolaryngologic Diseases,True,0,409,132
33,D009422,C10,Nervous System Diseases,True,0,2335,998
34,D005128,C11,Eye Diseases,True,0,910,270
35,D052801,C12,Male Urogenital Diseases,True,0,434,265


In [None]:
# distinct mesh_ids that sit under a disease top-level term
all_diseases = set(top_map_df.loc[top_map_df["top_is_disease"], "mesh_id"])
len(all_diseases)

11251

## GitHub outputs

In [11]:
# Base URL of the published MeSH output files (raw files on the `output` branch of
# the nxontology-data repository). File names are appended directly in the cells
# below, so the trailing slash is required.
gh_data_url = "https://github.com/related-sciences/nxontology-data/raw/output/mesh/"

In [16]:
# reproducible 5-row sample of the published top-level map
top_level_map_url = f"{gh_data_url}mesh_topical_descriptor_descendants_top_level_map.json.gz"
pd.read_json(top_level_map_url).sample(n=5, random_state=0)

Unnamed: 0,mesh_id,mesh_label,mesh_class,top_mesh_id,top_tree_number,top_mesh_label,top_is_disease,depth
36053,C000664754,Calonectria longiramosa,SCR_Organism,D056890,B01,Eukaryota,False,4
420240,C474744,"GP90-MC301 protein, rat",SCR_Chemical,D000602,D12,"Amino Acids, Peptides, and Proteins",False,3
102707,C543241,"Cardiomyopathy, fatal fetal, due to myocardial...",SCR_Disease,D013568,C23,"Pathological Conditions, Signs and Symptoms",True,4
146679,C038055,N-nitrosomethyl-N-propylamine,SCR_Chemical,D009930,D02,Organic Chemicals,False,3
572839,C000725016,Sauk-Suiattle Indian Tribe,SCR_Population,D011154,N01,Population Characteristics,False,5


In [17]:
# first published identifier record of each MeSH class
identifiers_url = f"{gh_data_url}mesh_identifiers.json.gz"
pd.read_json(identifiers_url).drop_duplicates(subset="mesh_class")

Unnamed: 0,mesh_id,mesh_class,mesh_uri,mesh_label,mesh_date_created,mesh_date_revised,mesh_date_established,mesh_frequency,mesh_description,tree_numbers,in_full_nxo,in_desc_nxo
0,C000002,SCR_Chemical,http://id.nlm.nih.gov/mesh/2023/C000002,bevonium,1971-01-01,2018-09-24,,1.0,structure given in first source,,True,True
2346,C000591739,SCR_Disease,http://id.nlm.nih.gov/mesh/2023/C000591739,"familial gynecomastia, due to increased aromat...",2014-11-04,2022-11-02,,5.0,,,True,True
15848,C000612320,SCR_Organism,http://id.nlm.nih.gov/mesh/2023/C000612320,UR2 sarcoma virus,2017-10-18,2020-09-30,,0.0,,,True,True
93595,C000705129,SCR_Protocol,http://id.nlm.nih.gov/mesh/2023/C000705129,LAMP assay,2020-03-28,2020-03-31,,930.0,for use of LAMP assay for detecting COVID-19 a...,,True,True
98889,C000724254,SCR_Population,http://id.nlm.nih.gov/mesh/2023/C000724254,Big Sandy Rancheria of Western Mono Indians of...,2022-11-29,2022-11-29,,0.0,,,True,True
321747,D000001,TopicalDescriptor,http://id.nlm.nih.gov/mesh/2023/D000001,Calcimycin,1974-11-19,2016-05-27,1984-01-01,,,[D03.633.100.221.173],True,True
321964,D000068036,PublicationType,http://id.nlm.nih.gov/mesh/2023/D000068036,Graphic Novel,2015-07-01,2019-07-05,2016-01-01,,This heading is used as a Publication Type. Gr...,[V02.700.415],True,False
321965,D000068037,GeographicalDescriptor,http://id.nlm.nih.gov/mesh/2023/D000068037,South Sudan,2015-07-01,2021-01-26,2016-01-01,,,[Z01.058.290.120.745],True,False
329835,D005260,CheckTag,http://id.nlm.nih.gov/mesh/2023/D005260,Female,1999-01-01,2015-06-10,1966-01-01,,"check tag only for female organs, diseases, ph...",,True,False
352201,M0000001,Concept,http://id.nlm.nih.gov/mesh/2023/M0000001,Calcimycin,,,,,"An ionophorous, polyether antibiotic from Stre...",,False,False


In [18]:
# reproducible 5-row sample of the published synonyms table
synonyms_url = f"{gh_data_url}mesh_synonyms.json.gz"
pd.read_json(synonyms_url).sample(n=5, random_state=0)

Unnamed: 0,mesh_id,concept_id,term_id,mesh_label,concept_label,term_label,concept_is_preferred,term_is_preferred,term_label_is_preferred,term_lexical_tag,term_date_created,concept_relation_to_preferred
704006,C577955,M0581027,T836700,"3-(2-chlorophenyl)-1-(2,4-dihydroxyphenyl)prop...",NPD170,NPD170,False,True,True,LAB,2013-02-03,narrower
100718,C000674955,M000700981,T001033762,Derxomyces nakasei,Derxomyces nakasei,Bullera nakasei,True,False,True,NON,2020-03-26,exact
586186,C509465,M0497018,T671283,meso-tetrakis(heptafluoropropyl)porphyrin,meso-tetrakis(heptafluoropropyl)porphyrin,meso-tetrakis(heptafluoropropyl)porphyrin,True,True,True,NON,2006-04-13,exact
503573,C459630,M0430868,T503918,ent-beyer-15-en-18-O-oxalate,ent-beyer-15-en-18-O-oxalate,ent-beyer-15-en-18-O-oxalate,True,True,True,NON,2002-07-14,exact
905742,D020526,M0328180,T000952246,Brain Stem Infarctions,Brain Stem Infarctions,Brain Stem Infarcts,True,False,False,NON,2019-02-26,exact


In [19]:
# reproducible 5-row sample of the published descriptor-qualifier pairs
descriptor_qualifier_pairs_url = f"{gh_data_url}mesh_descriptor_qualifier_pairs.json.gz"
pd.read_json(descriptor_qualifier_pairs_url).sample(n=5, random_state=0)

Unnamed: 0,pair_label,pair_allowed,pair_type,pair_uri,descriptor_id,qualifier_id,descriptor_label,qualifier_label,use_instead_uri,use_instead_label,use_instead_class
45319,Masked Mycotoxins/physiology,,AllowedDescriptorQualifierPair,http://id.nlm.nih.gov/mesh/2023/D000083842Q000502,D000083842,Q000502,Masked Mycotoxins,physiology,,,
114717,Chromium Alloys/analysis,,AllowedDescriptorQualifierPair,http://id.nlm.nih.gov/mesh/2023/D002858Q000032,D002858,Q000032,Chromium Alloys,analysis,,,
251852,o-Phthalaldehyde/standards,,AllowedDescriptorQualifierPair,http://id.nlm.nih.gov/mesh/2023/D009764Q000592,D009764,Q000592,o-Phthalaldehyde,standards,,,
369654,HLA-B8 Antigen/cerebrospinal fluid,,AllowedDescriptorQualifierPair,http://id.nlm.nih.gov/mesh/2023/D015795Q000134,D015795,Q000134,HLA-B8 Antigen,cerebrospinal fluid,,,
350916,Vertebral Artery/enzymology,,AllowedDescriptorQualifierPair,http://id.nlm.nih.gov/mesh/2023/D014711Q000201,D014711,Q000201,Vertebral Artery,enzymology,,,
