# Development notebook for creating a MeSH NXOntology via SPARQL queries

In [1]:
# type: ignore
# autoreload mode 2 re-imports modules before each cell executes, so edits to
# the nxontology_data package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd
from nxontology_data.mesh.mesh import MeshLoader

## load RDF

In [2]:
%%time
# MeSH release year; reused further down when the ontology is created.
mesh_year = "2021"
# Obtain the MeSH RDF for that year via MeshLoader.
# Slow step: the recorded run below took about 17.5 minutes of wall time.
rdf = MeshLoader.get_mesh_rdf(mesh_year)

CPU times: user 9min 1s, sys: 9.33 s, total: 9min 10s
Wall time: 17min 31s


## identifiers

In [3]:
%%time
# Run the "identifiers" query against the RDF. The preview shows the resulting
# columns: mesh_id, mesh_class, mesh_uri and mesh_label.
# NOTE(review): cache=True presumably reuses a stored result on later runs —
# confirm against MeshLoader.run_query.
id_df = MeshLoader.run_query(rdf, "identifiers", cache=True)
id_df.head(2)

CPU times: user 6min 38s, sys: 1.2 s, total: 6min 39s
Wall time: 6min 39s


Unnamed: 0,mesh_id,mesh_class,mesh_uri,mesh_label
0,C000002,SCR_Chemical,http://id.nlm.nih.gov/mesh/2021/C000002,bevonium
1,C000006,SCR_Chemical,http://id.nlm.nih.gov/mesh/2021/C000006,"insulin, neutral"


In [4]:
# Number of identifiers per MeSH class, most frequent first.
id_df["mesh_class"].value_counts()

Term                      835307
Concept                   454398
SCR_Chemical              245164
SCR_Organism               65256
TopicalDescriptor          29331
SCR_Disease                 6526
SCR_Protocol                1217
GeographicalDescriptor       397
PublicationType              187
Qualifier                     76
CheckTag                       2
Name: mesh_class, dtype: int64

In [5]:
# Number of identifiers with no label; a non-zero count would warrant inspection.
# Show the affected rows with: id_df[id_df["mesh_label"].isna()]
id_df["mesh_label"].isna().sum()

0

In [6]:
# Check whether any mesh_id occurs more than once.
# Show the affected rows with: id_df[id_df["mesh_id"].duplicated(keep=False)]
id_df["mesh_id"].duplicated().any()

False

In [7]:
# Preview the first two records whose class is Qualifier.
id_df.query("mesh_class=='Qualifier'").head(2)

Unnamed: 0,mesh_id,mesh_class,mesh_uri,mesh_label
802478,Q000000981,Qualifier,http://id.nlm.nih.gov/mesh/2021/Q000000981,diagnostic imaging
802479,Q000002,Qualifier,http://id.nlm.nih.gov/mesh/2021/Q000002,abnormalities


## tree numbers

In [8]:
%%time
# Run the "tree-numbers" query against the RDF.
# A mesh_id can appear on several rows, one per tree number (see D000002 in
# the preview), so mesh_id is not a unique key in this frame.
tree_number_df = MeshLoader.run_query(rdf, "tree-numbers", cache=True)
tree_number_df.head(4)

CPU times: user 12.3 s, sys: 0 ns, total: 12.3 s
Wall time: 12.3 s


Unnamed: 0,mesh_id,tree_number
0,D000001,D03.633.100.221.173
1,D000002,D02.705.400.625.800
2,D000002,D02.705.539.345.800
3,D000002,D02.886.300.692.800


## load into networkx

Currently, the ontology excludes qualifiers and descriptor-qualifier pairs; concept and term records are likewise not added as nodes (see the class breakdown below). Future work should determine how to handle these cases.

In [9]:
%%time
# Build the ontology from the RDF for the selected MeSH year, then report its
# size as a (number of nodes, number of edges) tuple.
nxo = MeshLoader.create_nxo(rdf, year_yyyy=mesh_year)
nxo.n_nodes, nxo.graph.number_of_edges()

CPU times: user 7min 12s, sys: 990 ms, total: 7min 13s
Wall time: 7min 13s


(348080, 433270)

In [10]:
# Flag each identifier by whether it is a node of the ontology graph.
graph_nodes = set(nxo.graph)
id_df["in_nxo"] = id_df["mesh_id"].isin(graph_nodes)
id_df.head(2)

Unnamed: 0,mesh_id,mesh_class,mesh_uri,mesh_label,in_nxo
0,C000002,SCR_Chemical,http://id.nlm.nih.gov/mesh/2021/C000002,bevonium,True
1,C000006,SCR_Chemical,http://id.nlm.nih.gov/mesh/2021/C000006,"insulin, neutral",True


In [11]:
# Per MeSH class, how many identifiers are absent from / present in the graph.
pd.crosstab(index=id_df["mesh_class"], columns=id_df["in_nxo"]).reset_index()

in_nxo,mesh_class,False,True
0,CheckTag,0,2
1,Concept,454398,0
2,GeographicalDescriptor,0,397
3,PublicationType,0,187
4,Qualifier,76,0
5,SCR_Chemical,0,245164
6,SCR_Disease,0,6526
7,SCR_Organism,0,65256
8,SCR_Protocol,0,1217
9,Term,835307,0


In [12]:
# Every node in the graph must have a row in id_df. A failure here means the
# graph contains nodes that the identifiers query does not return,
# e.g. descriptor-qualifier pairs.
nodes_without_id = set(nxo.graph) - set(id_df["mesh_id"])
# Report how many nodes are affected and a small sample, so a failure is
# actionable without re-running the set difference by hand.
assert not nodes_without_id, (
    f"{len(nodes_without_id)} graph nodes missing from id_df, "
    f"e.g. {list(nodes_without_id)[:5]}"
)

In [15]:
%%time
# Build a table from the ontology that relates nodes to top-level terms; the
# preview shows columns for the node, its top-level term, and a depth value.
# NOTE(review): presumably one row per (node, top-level ancestor) pair —
# confirm against MeshLoader.create_top_level_map_df.
top_map_df = MeshLoader.create_top_level_map_df(nxo)
top_map_df.head()

CPU times: user 29.3 s, sys: 172 ms, total: 29.5 s
Wall time: 29.5 s


Unnamed: 0,mesh_id,mesh_label,mesh_class,top_mesh_id,top_tree_number,top_mesh_label,top_is_disease,depth
466639,D001829,Body Regions,TopicalDescriptor,D001829,A01,Body Regions,False,0
466849,D001940,Breast,TopicalDescriptor,D001829,A01,Body Regions,False,1
472065,D005121,Extremities,TopicalDescriptor,D001829,A01,Body Regions,False,1
473889,D006257,Head,TopicalDescriptor,D001829,A01,Body Regions,False,1
478907,D009333,Neck,TopicalDescriptor,D001829,A01,Body Regions,False,1


In [16]:
# Top-level terms and the number of mapped nodes by class.

# One row per top-level term, carrying its descriptive columns.
top_terms_df = top_map_df[
    ["top_mesh_id", "top_tree_number", "top_mesh_label", "top_is_disease"]
].drop_duplicates()

# Rows of top_map_df counted per (top-level term, mesh_class), spread into one
# column per class. unstack(fill_value=0) keeps the counts as integers and
# avoids routing them through pivot_table's default "mean" aggregation.
class_counts_df = (
    top_map_df
    .groupby(["top_mesh_id", "mesh_class"])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

# Name the join key explicitly rather than relying on merge's default of
# joining on every column name the two frames happen to share.
top_summary_df = top_terms_df.merge(class_counts_df, on="top_mesh_id")
top_summary_df.head(2)

Unnamed: 0,top_mesh_id,top_tree_number,top_mesh_label,top_is_disease,GeographicalDescriptor,SCR_Chemical,SCR_Disease,SCR_Organism,SCR_Protocol,TopicalDescriptor
0,D001829,A01,Body Regions,False,0,1,7,0,0,266
1,D009141,A02,Musculoskeletal System,False,0,4,5,0,0,242


### disease subset

In [18]:
# Disease top-level terms only, with the class columns listed here hidden.
hidden_class_columns = ["GeographicalDescriptor", "SCR_Organism", "SCR_Protocol"]
top_summary_df.query("top_is_disease").drop(columns=hidden_class_columns)

Unnamed: 0,top_mesh_id,top_tree_number,top_mesh_label,top_is_disease,SCR_Chemical,SCR_Disease,TopicalDescriptor
26,D007239,C01,Infections,True,1,119,798
27,D009369,C04,Neoplasms,True,0,401,704
28,D009140,C05,Musculoskeletal Diseases,True,0,1750,392
29,D004066,C06,Digestive System Diseases,True,0,302,321
30,D009057,C07,Stomatognathic Diseases,True,0,354,229
31,D012140,C08,Respiratory Tract Diseases,True,1,162,247
32,D010038,C09,Otorhinolaryngologic Diseases,True,0,409,132
33,D009422,C10,Nervous System Diseases,True,0,2335,998
34,D005128,C11,Eye Diseases,True,0,910,270
35,D052801,C12,Male Urogenital Diseases,True,0,434,265


In [19]:
# Distinct mesh_ids that map to a top-level term flagged as a disease.
disease_map_df = top_map_df.query("top_is_disease")
all_diseases = set(disease_map_df["mesh_id"])
len(all_diseases)

11251