# Process IEEE Thesaurus and ACM vocabulary. 

This notebook needs to be processed only once. Output is stored as a text file with technical terms from both the sources (IEEE and ACM). 


In [13]:
#Install JSON Path as necessary
#!pip install jsonpath-ng
#!pip install bs4

In [14]:
import re
from jsonpath_ng.ext import parse  
import json
from bs4 import BeautifulSoup

## IEEE Thesaurus processing
IEEE thesaurus is only available in PDF format. First convert pdf to text using online tools.
Part of extraction, need to remove headers/footers from the text and remove any classification. The IEEE thesausrus (pdf) can be downlaoded from https://www.ieee.org/publications/services/thesaurus-access-page.html 

In [15]:
 # PS update the path location
with open('../input_files/ieee-thesaurus.txt','r') as f:
    lines = f.readlines()

In [16]:
# remove all header/footer lines, remove unnecessary characters/new lines, empty lines etc
pattern = r'\x0cJULY 2023 IEEE Thesaurus\n'
lines= [re.sub(pattern,'',line) for line in lines]
pattern = r'^\s*This work is licensed under the Creative Commons.*$'
lines= [re.sub(pattern,'',line) for line in lines]
pattern = pattern = r'^\s*International License.*$'
lines= [re.sub(pattern,'',line) for line in lines]
pattern = r'Engineers.*$'
lines= [re.sub(pattern,'',line) for line in lines]
#pattern = r'[BT:,RT:,NT:,UF:,USE:]'
pattern = r'BT:'
lines= [re.sub(pattern,'',line) for line in lines]
pattern = r'RT:'
lines= [re.sub(pattern,'',line) for line in lines]
pattern = r'NT:'
lines= [re.sub(pattern,'',line) for line in lines]
pattern = r'USE:'
lines= [re.sub(pattern,'',line) for line in lines]
pattern = r'UF:'
lines= [re.sub(pattern,'',line) for line in lines]
lines = [re.sub('\n','',line) for line in lines]
lines = [re.sub('Page.*$','',line) for line in lines]
lines = [line for line in lines if line.strip()]

In [17]:
lines = list(set(lines[58:])) # remove the headerlines
# Take a peek at the output
len(lines), lines[:5], lines[-5:]

(12476,
 ['APCVD',
  'Active noise reduction',
  'Transportation industry',
  'Particle beam',
  'Optical flow'],
 ['Engineering in medicine and biology',
  'Headphones',
  'Cobalt alloys',
  'Diffusion processes',
  'Semiconductor-insulator'])

In [18]:
with open('../preprocessed_files/ieee_tech_words.txt','w') as nf:
    nf.writelines('\n'.join(map(str, lines)))

## ACM processing
ACM words are available in json format but have HTML tags that need to be removed. ACM vocabulary in json format can be downloaded from:https://csrc.nist.gov/glossary 

In [19]:
# PS update the path location
with open('../input_files/glossary-export.json','r',encoding='utf-8-sig') as f:
    gtext = json.load(f)

In [20]:
# extract terms using json path. We are only interested in the term
jpexpr = parse("parentTerms..term")
terms = [match.value for match in jpexpr.find(gtext)]

In [21]:
terms[:5]

['(EC)DH',
 '(p, t)-completeness',
 '(t + k)-way combination coverage',
 '.csv',
 '[<i>T</i>]<sub>2</sub>']

In [22]:
# remove HTML tags
acm_terms = []
for term in terms:
    if '<' in term:
        soup = BeautifulSoup(term, 'html.parser')
        text_content = soup.get_text()
    else:
        text_content = term
    acm_terms.append(text_content)
print(acm_terms[:5])
acm_terms = list(set(acm_terms))

['(EC)DH', '(p, t)-completeness', '(t + k)-way combination coverage', '.csv', '[T]2']


In [23]:
# append tech terms to IEEE words list
tech_terms = list(set(lines + acm_terms))
len(tech_terms), tech_terms[:5]

(21649,
 ['APCVD',
  'Particle beam',
  'Publish/subscrbe systems',
  'Computer integrated manufacturing',
  'United States Government Configuration Baseline (USGCB)'])

In [24]:
# Store output in a technology terms file
with open('../preprocessed_files/tech_terms.txt','w') as f:
    f.writelines('\n'.join(map(str, tech_terms)))