This script pulls from the existing NASA SMD vocabularies, as compiled by the Science Discovery Engine (SDE) project, to create a database of all Heliophysics terms, recording whether each term is defined, the number of times it is defined, and any gaps.

It is related to the "compile_DB_existingVocabularies.ipynb" script, which uses a different set of space weather and helio relevant vocabularies

We will not attempt to bring in all glossaries identified by the SDE project here. The subset we will begin with is:

- HELIO Ontology
- Heliophysics Events Knowledgebase
- NASA CCMC
- SWEET
- SPASE
- NASA Heliophysics Vocabulary
- Space Weather Glossary
- Space Weather Glossary (Second)
- ESA Space Weather Glossary
- Unified Astronomy Thesaurus
- AGU index terms




In [20]:
import os, sys, re
import pandas as pd
import numpy as np

from glob import glob
import json

In [10]:
# Directory holding the SDE "SupersetVocab" JSON exports.
# A complete listing of that directory can be obtained with
# glob(superset_vocab_dir + "*.json").
superset_vocab_dir = (
    '/Users/ryanmcgranaghan/Documents/NASA_FDL/2022/sdm-kg-nasa-2022'
    '/data/raw/SMDVocabulary/SupersetVocab/'
)

# File names of the vocabularies used in this notebook, in processing order.
glossary_filenames = (
    'HELIO Ontology.json',
    'Heliophysics Events Knowledgebase.json',
    'NASA CCMC.json',
    'Semantic Web for Earth and Environment Technology Ontology.json',
    'SPASE Dictionary.json',
    'NASA Heliophysics Vocabulary.json',
    'Space Weather Glossary.json',
    'Space Weather Glossary (Second).json',
    'ESA Space Weather Glossary.json',
    'Unified Astronomy Thesaurus (UAT).json',
    'AGU Index Terms.json',
)

# Absolute path of every glossary JSON file to load.
glossaries_list = [superset_vocab_dir + filename for filename in glossary_filenames]

In [None]:
# Empty frame with the term / definition / source schema.
# NOTE(review): not referenced again in the cells visible here -- confirm
# whether it is still needed.
compiled_columns = ['term', 'definition', 'source']
pd_compiled = pd.DataFrame(columns=compiled_columns)


In [60]:

# Collect every term, its definition, and the vocabulary file it came from
# into three parallel lists spanning all glossaries in `glossaries_list`.
terms_full = []
definitions_full = []
sources_full = []

# Word-boundary pattern for camelCase / PascalCase identifiers: matches a
# lowercase letter that is followed by an uppercase one, or an uppercase
# letter that is followed by an uppercase+lowercase pair. Compiled once
# because it is applied to every term of every glossary.
camel_case_boundary = re.compile(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))')

for g in glossaries_list:
    # Label the source by file name instead of a fixed character offset into
    # the path, so the label stays correct if the data directory changes.
    source_name = os.path.basename(g)
    print('working on {}...'.format(source_name))

    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(g, encoding='utf-8') as json_file:
        content = json.load(json_file)

    # Insert a space at each camelCase boundary, then lower-case the term.
    # NOTE(review): terms are not stripped, so an entry with trailing
    # whitespace is counted as a different term from the same text without
    # it -- confirm whether that is intended.
    terms_loop = [
        camel_case_boundary.sub(r'\1 ', term['Term']).lower()
        for term in content["Terms"]
    ]
    definitions_loop = [term['Definition'] for term in content["Terms"]]
    # One source label per term, so the three lists stay aligned.
    sources_loop = [source_name] * len(terms_loop)

    # Add to full lists
    terms_full.extend(terms_loop)
    definitions_full.extend(definitions_loop)
    sources_full.extend(sources_loop)
    

working on HELIO Ontology.json...
working on Heliophysics Events Knowledgebase.json...
working on NASA CCMC.json...
working on Semantic Web for Earth and Environment Technology Ontology.json...
working on SPASE Dictionary.json...
working on NASA Heliophysics Vocabulary.json...
working on Space Weather Glossary.json...
working on Space Weather Glossary (Second).json...
working on ESA Space Weather Glossary.json...
working on Unified Astronomy Thesaurus (UAT).json...
working on AGU Index Terms.json...


In [61]:
# Assemble the combined table from the three parallel lists; the originating
# vocabulary file name is stored under the 'type' column.
combined_columns = dict(term=terms_full, definition=definitions_full, type=sources_full)
pd_full = pd.DataFrame(combined_columns)
pd_full

Unnamed: 0,term,definition,type
0,thing,,HELIO Ontology.json
1,data resource,,HELIO Ontology.json
2,service,,HELIO Ontology.json
3,catalog,A tabular listing of events or observational n...,HELIO Ontology.json
4,record,,HELIO Ontology.json
...,...,...,...
19357,new fields,,AGU Index Terms.json
19358,notices and announcements,,AGU Index Terms.json
19359,techniques applicable in three or more fields,,AGU Index Terms.json
19360,corrections,,AGU Index Terms.json


In [75]:
# For every row, record how many rows of the combined table carry the same
# term (repeats within a single vocabulary are counted too).
# Tally each distinct term once, then look the tally up per row; this avoids
# rescanning the whole column for every row and does not depend on the frame
# having a default 0..n-1 index.
term_counts = pd_full['term'].value_counts()
pd_full['occurrences'] = pd_full['term'].map(term_counts)

In [76]:
# Display the combined table, which now includes the 'occurrences' column.
pd_full

Unnamed: 0,term,definition,type,occurrences
0,thing,,HELIO Ontology.json,3
1,data resource,,HELIO Ontology.json,1
2,service,,HELIO Ontology.json,3
3,catalog,A tabular listing of events or observational n...,HELIO Ontology.json,3
4,record,,HELIO Ontology.json,3
...,...,...,...,...
19357,new fields,,AGU Index Terms.json,2
19358,notices and announcements,,AGU Index Terms.json,1
19359,techniques applicable in three or more fields,,AGU Index Terms.json,1
19360,corrections,,AGU Index Terms.json,1


In [91]:
# Terms occurring strictly more often than this are collected as "common".
occurrence_threshold = 2

# Boolean row mask over the table, then the distinct terms of the kept rows.
mask_occurrences = pd_full['occurrences'].values > occurrence_threshold
common_terms = set(pd_full.loc[mask_occurrences, 'term'])

summary_template = 'length of common terms list for occurrences > {} = {}'
print(summary_template.format(occurrence_threshold, len(common_terms)))

length of common terms list for occurrences > 2 = 495


In [92]:
# Display the set of distinct terms whose occurrence count exceeds the threshold.
common_terms


{'53 persei stars',
 'a dwarf stars',
 'a giant stars',
 'a subdwarf stars',
 'a subgiant stars',
 'a supergiant stars',
 'absorption',
 'active',
 'active galactic nuclei',
 'active region',
 'ae stars',
 'aerosol',
 'affine invariant',
 'akasofu epsilon',
 'albedo',
 'alfven velocity',
 'alfven waves',
 'algol variable stars',
 'alpha particle',
 'am canum venaticorum stars',
 'am herculis stars',
 'am stars',
 'annotation',
 'antenna',
 'ap stars',
 'aphelion',
 'astronomy databases',
 'atmospheres ',
 'atmospheric tides',
 'atom',
 'aurora',
 'auroral region',
 'autumnal equinox',
 'b dwarf stars',
 'b giant stars',
 'b subdwarf stars',
 'b subgiant stars',
 'b supergiant stars',
 'b(e) stars',
 'barium stars',
 'be stars',
 'beta cephei variable stars',
 'beta lyrae stars',
 'big bang nucleosynthesis',
 'binary pulsars',
 'biogeochemical cycles, processes, and modeling ',
 'blazars',
 'bok globules',
 'bow shock',
 'bow shock crossing',
 'bp stars',
 'brightness temperature',
 'br

In [84]:
# Write the combined table, without the index column, to
# data/SMD_glossaries_combined.csv relative to the current working directory.
output_dir = 'data'
# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)
pd_full.to_csv(os.path.join(output_dir, 'SMD_glossaries_combined.csv'), index=False)


In [83]:
# Show the current working directory, against which the relative 'data'
# path used for the CSV export is resolved.
os.getcwd()


'/Users/ryanmcgranaghan/Documents/Helio_ECIP/dev/Helio-KNOW/ADS_enrichment'