This script reads the synthesized terms from the [space weather harmonization spreadsheet](https://docs.google.com/spreadsheets/d/1BvmbmcmKB9_95pd6_nknf0cBQMH76OxM4cfEZFbFiTk/edit?usp=sharing), searches the following glossaries for each term, and collects every definition that exists:

- HELIO Ontology
- Heliophysics Event Knowledge Base
- NASA CCMC
- SWEET
- SPASE
- NASA Heliophysics Vocabulary
- Space Weather Glossary
- Space Weather Glossary (Second)
- ESA Space Weather Glossary (ESA)
- Unified Astronomy Thesaurus
- AGU index terms




In [12]:
# Display names of the glossaries covered by this notebook, one per line.
list_glossaries = [
    'HELIO Ontology',
    'Heliophysics Event Knowledge Base',
    'NASA CCMC',
    'SWEET',
    'SPASE',
    'NASA Heliophysics Vocabulary',
    'Space Weather Glossary',
    'Space Weather Glossary (Second)',
    'ESA Space Weather Glossary (ESA)',
    'Unified Astronomy Thesaurus',
    'AGU index terms',
]

In [2]:
import os, sys, re
import pandas as pd
import numpy as np

from glob import glob
import json



#### Read and compile glossaries



In [4]:
# Absolute paths of the glossary JSON files that are compiled into pd_full.
# Each file is read as a JSON object with a "Terms" list whose entries carry
# 'Term' and 'Definition' keys.
glossaries_list = [
    '/Users/ryanmcgranaghan/Documents/NASA_FDL/2022/sdm-kg-nasa-2022/data/raw/SMDVocabulary/SupersetVocab/HELIO Ontology.json',
    '/Users/ryanmcgranaghan/Documents/NASA_FDL/2022/sdm-kg-nasa-2022/data/raw/SMDVocabulary/SupersetVocab/Heliophysics Events Knowledgebase.json',
    '/Users/ryanmcgranaghan/Documents/NASA_FDL/2022/sdm-kg-nasa-2022/data/raw/SMDVocabulary/SupersetVocab/NASA CCMC.json',
    '/Users/ryanmcgranaghan/Documents/NASA_FDL/2022/sdm-kg-nasa-2022/data/raw/SMDVocabulary/SupersetVocab/Semantic Web for Earth and Environment Technology Ontology.json',
    '/Users/ryanmcgranaghan/Documents/NASA_FDL/2022/sdm-kg-nasa-2022/data/raw/SMDVocabulary/SupersetVocab/SPASE Dictionary.json',
    '/Users/ryanmcgranaghan/Documents/NASA_FDL/2022/sdm-kg-nasa-2022/data/raw/SMDVocabulary/SupersetVocab/NASA Heliophysics Vocabulary.json',
    '/Users/ryanmcgranaghan/Documents/NASA_FDL/2022/sdm-kg-nasa-2022/data/raw/SMDVocabulary/SupersetVocab/Space Weather Glossary.json',
    '/Users/ryanmcgranaghan/Documents/NASA_FDL/2022/sdm-kg-nasa-2022/data/raw/SMDVocabulary/SupersetVocab/Space Weather Glossary (Second).json',
    '/Users/ryanmcgranaghan/Documents/NASA_FDL/2022/sdm-kg-nasa-2022/data/raw/SMDVocabulary/SupersetVocab/ESA Space Weather Glossary.json',
    '/Users/ryanmcgranaghan/Documents/NASA_FDL/2022/sdm-kg-nasa-2022/data/raw/SMDVocabulary/SupersetVocab/Unified Astronomy Thesaurus (UAT).json',
    '/Users/ryanmcgranaghan/Documents/NASA_FDL/2022/sdm-kg-nasa-2022/data/raw/SMDVocabulary/SupersetVocab/AGU Index Terms.json',
]

# Empty frame kept for backward compatibility; nothing in this cell fills it.
pd_compiled = pd.DataFrame(columns=['term','definition','source'])

# Matches the character just before a camelCase / acronym boundary so that a
# space can be inserted after it. Compiled once, outside the loop.
camel_boundary = re.compile(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))')

terms_full = []
definitions_full = []
sources_full = []

for g in glossaries_list:
    # Label rows with the file name. basename replaces the former fixed-width
    # slice g[101:], which only worked for one exact directory prefix length.
    source_name = os.path.basename(g)
    print('working on {}...'.format(source_name))

    # Explicit encoding so the read does not depend on the platform default.
    with open(g, encoding='utf-8') as json_file:
        content = json.load(json_file)

    entries = content["Terms"]
    # Split camelCase boundaries with a space, then lowercase the term.
    terms_loop = [camel_boundary.sub(r'\1 ', entry['Term']).lower() for entry in entries]
    definitions_loop = [entry['Definition'] for entry in entries]
    sources_loop = [source_name] * len(terms_loop)

    # Add to full lists
    terms_full.extend(terms_loop)
    definitions_full.extend(definitions_loop)
    sources_full.extend(sources_loop)

# One row per glossary entry; the 'type' column holds the source file name.
pd_full = pd.DataFrame({'term':terms_full,'definition':definitions_full,'type':sources_full})
pd_full

working on HELIO Ontology.json...
working on Heliophysics Events Knowledgebase.json...
working on NASA CCMC.json...
working on Semantic Web for Earth and Environment Technology Ontology.json...
working on SPASE Dictionary.json...
working on NASA Heliophysics Vocabulary.json...
working on Space Weather Glossary.json...
working on Space Weather Glossary (Second).json...
working on ESA Space Weather Glossary.json...
working on Unified Astronomy Thesaurus (UAT).json...
working on AGU Index Terms.json...


Unnamed: 0,term,definition,type
0,thing,,HELIO Ontology.json
1,data resource,,HELIO Ontology.json
2,service,,HELIO Ontology.json
3,catalog,A tabular listing of events or observational n...,HELIO Ontology.json
4,record,,HELIO Ontology.json
...,...,...,...
19357,new fields,,AGU Index Terms.json
19358,notices and announcements,,AGU Index Terms.json
19359,techniques applicable in three or more fields,,AGU Index Terms.json
19360,corrections,,AGU Index Terms.json


#### Read terms synthesized from compile_SMD_vocabs_Helio.ipynb and manual process

In [7]:
# Spreadsheet holding the synthesized terms.
file_synthesized_terms = '/Users/ryanmcgranaghan/Documents/Helio_ECIP/dev/Helio-KNOW/ADS_enrichment/data/SMD_and_manual_synthesized_terms.xlsx'

# Load the sheet and discard every row that contains a missing value.
pd_terms = pd.read_excel(file_synthesized_terms).dropna()


In [8]:
# Display the synthesized terms table (rows with missing values already dropped).
pd_terms

Unnamed: 0,Terms
0,solar flare
1,flux rope emergence
2,sunspot
3,magnetic reconnection
4,solar cycle
...,...
148,trojan asteroids
149,aurora
151,magnetic field
152,magnetosphere


In [28]:
# Re-display the compiled glossary table (columns: term, definition, type).
pd_full

Unnamed: 0,term,definition,type
0,thing,,HELIO Ontology.json
1,data resource,,HELIO Ontology.json
2,service,,HELIO Ontology.json
3,catalog,A tabular listing of events or observational n...,HELIO Ontology.json
4,record,,HELIO Ontology.json
...,...,...,...
19357,new fields,,AGU Index Terms.json
19358,notices and announcements,,AGU Index Terms.json
19359,techniques applicable in three or more fields,,AGU Index Terms.json
19360,corrections,,AGU Index Terms.json


In [41]:
# Row positions in pd_full whose term is exactly 'aurora'. np.argwhere returns
# an array of shape (n_matches, 1), so each element of idx is itself a
# one-element array rather than a plain integer.
idx = np.argwhere(pd_full['term'].values == 'aurora')

In [40]:
# Scratch check: does the seventh 'aurora' match have a non-empty definition?
# NOTE(review): the position 6 is hard-coded, so this raises IndexError when
# idx holds fewer than seven matches. The test relies on the truthiness of a
# one-element array returned by `.values`.
if pd_full.iloc[idx[6]]['definition'].values:
    print('yes')
# pd_full.iloc[idx[1]]['type'].values[0]

In [60]:
# For every row position in idx, keep the definition (when it is truthy) and
# append the name of the glossary it came from.
defs_loop = [
    row['definition'].values[0] + '    source: ' + row['type'].values[0]
    for row in (pd_full.iloc[i] for i in idx)
    if row['definition'].values
]
        

        
        

In [61]:
# Display the definitions collected for the current idx selection.
# NOTE(review): the saved output below includes ' ' separator entries that the
# active code building defs_loop does not append, so it appears to come from
# an earlier run — re-run the cells to refresh it.
defs_loop

["An atmospheric phenomenon consisting of bands of light caused by charged solar particles following the earth's magnetic lines of force. (spase)    source: HELIO Ontology.json",
 ' ',
 'transient displays of light, often displaying as moving curtains and rays, at high latitudes associated with geomagnetic disturbances    source: NASA CCMC.json',
 ' ',
 'The sporadic radiant emission from the upper atmosphere over the middle and high latitudes.    source: Semantic Web for Earth and Environment Technology Ontology.json',
 ' ',
 "An atmospheric phenomenon consisting of bandsof light caused by charged solar particles following the earth's magneticlines of force.    source: SPASE Dictionary.json",
 ' ',
 "An aurora is a natural display of light in the night sky that typically occurs in far northern and southern regions. Auroras occur when incoming charged particles from the sun strike oxygen and nitrogen some 60 to 200 miles up in Earth's atmosphere and release a flash of light and heat. E

In [68]:

# Build two parallel lists:
#   terms - every synthesized term, in the order found in pd_terms['Terms']
#   defs  - for each term, the list of "<definition>    source: <glossary>"
#           strings from every pd_full row whose term matches exactly
#           (an empty list when no glossary defines the term)
terms = []
defs = []

# Pull the columns out once instead of re-indexing the frame on every match.
full_terms = pd_full['term'].values
full_definitions = pd_full['definition'].values
full_sources = pd_full['type'].values

for t in pd_terms['Terms']:
    terms.append(t)

    defs_loop = []
    # flatnonzero yields plain integer positions of the matching rows.
    for pos in np.flatnonzero(full_terms == t):
        definition = full_definitions[pos]
        # Keep only real, non-empty text. This skips None / '' as before and
        # also guards against NaN or other non-string values, which would
        # otherwise fail on the string concatenation below.
        if isinstance(definition, str) and definition:
            defs_loop.append(definition + '    source: ' + full_sources[pos])
    defs.append(defs_loop)
    
#     print('--------------\n\n')

In [69]:
# Pair each term with its list of collected definitions, one row per term.
pd_terms_defs = pd.DataFrame({'term': terms, 'definitions': defs})
pd_terms_defs


Unnamed: 0,term,definitions
0,solar flare,[An explosive event in the Sun's atmosphere wh...
1,flux rope emergence,[]
2,sunspot,[Places on Solar Surface where Flux Tubes brea...
3,magnetic reconnection,[Magnetic reconnection is a physical process i...
4,solar cycle,[The sun goes through 11-year variations or cy...
...,...,...
119,trojan asteroids,[A member of the family of asteroids that shar...
120,aurora,[An atmospheric phenomenon consisting of bands...
121,magnetic field,[A region of space near a magnetized body wher...
122,magnetosphere,[The region of space dominated by the magnetic...


In [70]:
# Write the term/definitions table to CSV without the index column.
output_csv_path = '/Users/ryanmcgranaghan/Documents/Helio_ECIP/dev/Helio-KNOW/ADS_enrichment/data/SMD_and_manual_synthesized_terms_definitions.csv'
pd_terms_defs.to_csv(output_csv_path, index=False)