In [1]:
import json

In [2]:
# Load the CNSC glossary. The encoding is given explicitly because the
# glossary contains non-ASCII characters and open() would otherwise fall back
# to the platform's locale encoding, which is not UTF-8 everywhere.
with open('cnsc_full_glossary_final.json', 'r', encoding='utf-8') as f:
    cnsc_data = json.load(f)

In [3]:
# Load the USNRC glossary. The encoding is given explicitly so the file is
# decoded as UTF-8 regardless of the platform's locale default.
with open('usnrc_glossary_final.json', 'r', encoding='utf-8') as f:
    usnrc_data = json.load(f)

In [4]:
# Display the loaded CNSC glossary to inspect its structure
# (a dict mapping each term to its definition text).
cnsc_data

{'A1, A2': 'Ha[ve] the same meanings as in the IAEA Regulations. (Source: Packaging and Transport of Nuclear Substances Regulations, 2015 ) Note: A 1 and A 2 are values specific to each radionuclide and are used to determine the activity limits for transport packages.',
 'abandon': 'Remove from regulatory control.',
 'abiotic': 'Relating to the non-living parts of the environment such as air, rock, soil and water. Some abiotic components are topography, hydrology, drainage, climate, meteorology, and land-use patterns by members of the public.',
 'abnormal incident': 'An abnormal occurrence that may have a significant cause and/or may lead to more serious consequences.',
 'Aboriginal peoples of Canada': 'Includes the Indian, Inuit and Métis peoples of Canada. (Source: Constitution Act, 1982 ) Note: Outside of legal contexts, the term First Nations is used in place of the term Indian. See also Indigenous peoples .',
 'absent from duty': 'Not available on a day normally scheduled for duty

In [5]:
# Display the loaded USNRC glossary to inspect its structure
# (a dict mapping each term or abbreviation to its definition text).
usnrc_data

{'10 CFR': 'Title 10 of the Code of Federal Regulations',
 '3WFN': 'Three White Flint North',
 '8(a)': '8(a) Business Development Program',
 'A-E': 'architect-engineer',
 'AASG': 'Association of American State Geologists',
 'ABWR': 'advanced boiling-water reactor',
 'AC': 'alternating current',
 'ACC': 'additional Commission comments',
 'Access hatch': 'An airtight door system that preserves the pressure integrity of the containment structure of a nuclear reactor , while allowing access to personnel and equipment.',
 'ACHP': 'Advisory Council on Historic Preservation',
 'ACI': 'American Concrete Institute',
 'ACL': 'alternate concentration limit',
 'ACMUI': 'Advisory Committee on the Medical Use of Isotopes',
 'ACQC': 'American Society for Quality Control',
 'ACRS': 'Advisory Committee on Reactor Safeguards',
 'Activation': 'The process of making a radioisotope by bombarding a stable element with neutrons or protons .',
 'Active fuel length': 'The end-to-end dimension of fuel material 

In [6]:
# Normalise the glossary keys (trim surrounding whitespace, lower-case) so the
# comparison between the two sources ignores case and stray spaces.
cnsc_terms = {raw.strip().lower() for raw in cnsc_data}
usnrc_terms = {raw.strip().lower() for raw in usnrc_data}

In [7]:
# Terms whose normalised spelling occurs in both glossaries.
common_terms = cnsc_terms & usnrc_terms
print(f"Number of common terms: {len(common_terms)}")

Number of common terms: 129


In [8]:
# Each shared term has the same normalised spelling in both glossaries, so
# every record pairs the term with itself and is flagged as a match.
matched = [
    {"cnsc_term": shared, "nrc_term": shared, "is_same": True}
    for shared in common_terms
]

In [21]:
# Sanity check: one record per common term.
len(matched)

129

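Semantic matching: identical term names do not guarantee identical meanings, so compare the CNSC and USNRC definitions of each matched term to verify that the matches are correct.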

In [11]:
%pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tqdm (from sentence-transformers)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.7.1-cp312-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.16.0-cp312-cp312-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.33.4-py3-none-any.whl.metadata (1

In [12]:
%pip install tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [13]:
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Sentence-embedding model used further down to encode glossary definitions
# before comparing them by cosine similarity.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [22]:
# Index each glossary by its normalised term (lower-cased, whitespace-trimmed)
# while keeping the original spelling together with the definition.
# If two keys normalise to the same string, the later one wins.
cnsc_norm = dict(
    (raw_term.lower().strip(), (raw_term, definition))
    for raw_term, definition in cnsc_data.items()
)
usnrc_norm = dict(
    (raw_term.lower().strip(), (raw_term, definition))
    for raw_term, definition in usnrc_data.items()
)

In [None]:
# Pair up the entries whose normalised term exists in both glossaries, keeping
# each source's original spelling and definition side by side.
exact_matches = [
    {
        "cnsc_term": cnsc_norm[key][0],
        "nrc_term": usnrc_norm[key][0],
        "cnsc_def": cnsc_norm[key][1],
        "nrc_def": usnrc_norm[key][1],
    }
    for key in cnsc_norm
    if key in usnrc_norm
]

In [24]:
# Report how many terms matched by name across the two glossaries.
print(f"Found {len(exact_matches)} exact term matches.")


Found 129 exact term matches.


In [None]:
# Score every exact-name match by the cosine similarity of its two definitions.
# `all` receives every scored pair (with both definitions); `verified_matches`
# receives only the pairs whose score reaches the threshold.
verified_matches = []
# NOTE(review): this name shadows the builtin `all`; it is kept because later
# cells read it under this name.
all = []
# Adjustable cut-off; 0.27 was found to work well for this dataset.
threshold = 0.27
for match in tqdm(exact_matches):
    cnsc_vec = model.encode(match["cnsc_def"], convert_to_tensor=True)
    nrc_vec = model.encode(match["nrc_def"], convert_to_tensor=True)
    score = util.cos_sim(cnsc_vec, nrc_vec).item()

    # The threshold is applied to the unrounded score; only the stored
    # similarity value is rounded.
    record = {
        "cnsc_term": match["cnsc_term"],
        "nrc_term": match["nrc_term"],
        "similarity": round(score, 4),
        "cnsc_def": match["cnsc_def"],
        "nrc_def": match["nrc_def"],
        "is_same": score >= threshold,
    }
    all.append(record)

    if not record["is_same"]:
        continue
    # Verified entries carry the same fields minus the definitions.
    verified_matches.append(
        {field: record[field] for field in ("cnsc_term", "nrc_term", "similarity", "is_same")}
    )

100%|██████████| 129/129 [00:22<00:00,  5.85it/s]


In [36]:
# Order the scored pairs in place from most to least similar.
all.sort(reverse=True, key=lambda entry: entry["similarity"])

In [37]:
# Write every scored pair, including both definitions, to disk as indented JSON.
with open('all_matches.json', 'w') as out_file:
    json.dump(all, out_file, indent=2)

In [38]:
# Save only the pairs that reached the similarity threshold, then report how
# many were kept.
with open('verified_exact_matches.json', 'w') as out_file:
    json.dump(verified_matches, out_file, indent=2)

print(f"✅ Retained {len(verified_matches)} verified exact matches.")

✅ Retained 114 verified exact matches.
