In [None]:
import nltk
from nltk.corpus import wordnet as wn

# Fetch the WordNet corpus only when it is not installed yet, so the notebook
# runs on a fresh environment without re-downloading on every execution.
# The lazy corpus loader raises LookupError when the corpus cannot be found.
try:
    wn.ensure_loaded()
except LookupError:
    nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RhysL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### 2. Select Two Words

Let’s use:

"car"

"automobile"

These words are known to be synonyms. We can explore their synsets.

In [2]:
word1, word2 = 'car', 'automobile'

synsets_car = wn.synsets(word1)
synsets_auto = wn.synsets(word2)

# Print one "name: definition" line per synset, first for word1, then for word2.
for heading, senses in (("Car synsets:", synsets_car),
                        ("\nAutomobile synsets:", synsets_auto)):
    print(heading)
    for sense in senses:
        print(f"{sense.name()}: {sense.definition()}")


Car synsets:
car.n.01: a motor vehicle with four wheels; usually propelled by an internal combustion engine
car.n.02: a wheeled vehicle adapted to the rails of railroad
car.n.03: the compartment that is suspended from an airship and that carries personnel and the cargo and the power plant
car.n.04: where passengers ride up and down
cable_car.n.01: a conveyance for passengers or freight on a cable railway

Automobile synsets:
car.n.01: a motor vehicle with four wheels; usually propelled by an internal combustion engine
automobile.v.01: travel in an automobile


### 3. Check for Synonymy

Let’s check if they share the same synset:


In [3]:
# Synsets returned for both words; the notebook treats a shared synset as
# evidence that the two words are synonyms.
common_synsets = set(synsets_car).intersection(synsets_auto)
print("Common synsets between 'car' and 'automobile':")
# Sets have no guaranteed iteration order, so sort by synset name to keep the
# listing stable from one run to the next.
for syn in sorted(common_synsets, key=lambda s: s.name()):
    print(f"{syn.name()} - {syn.definition()}")


Common synsets between 'car' and 'automobile':
car.n.01 - a motor vehicle with four wheels; usually propelled by an internal combustion engine


### 4. Explore Hypernyms and Hyponyms

The cell below shows that:

A hypernym of "car" is "motor_vehicle".

Some hyponyms include "cab", "limousine", and "sports_car" (only the first five hyponyms are printed, so the printed sample may not include them).

In [4]:
car_synset = wn.synset('car.n.01')

# Each entry pairs a heading with the related synsets listed beneath it.
related_sections = [
    ("Hypernyms of 'car.n.01':", car_synset.hypernyms()),
    ("\nHyponyms of 'car.n.01':", car_synset.hyponyms()[:5]),  # show only the first 5
    ("\nRoot hypernyms of 'car.n.01':", car_synset.root_hypernyms()),
]

for heading, related in related_sections:
    print(heading)
    for h in related:
        print(f"{h.name()} - {h.definition()}")

Hypernyms of 'car.n.01':
motor_vehicle.n.01 - a self-propelled wheeled vehicle that does not run on rails

Hyponyms of 'car.n.01':
ambulance.n.01 - a vehicle that takes people to and from hospitals
roadster.n.01 - an open automobile having a front seat and a rumble seat
convertible.n.01 - a car that has top that can be folded or removed
gas_guzzler.n.01 - a car with relatively low fuel efficiency
subcompact.n.01 - a car smaller than a compact car

Root hypernyms of 'car.n.01':
entity.n.01 - that which is perceived or known or inferred to have its own distinct existence (living or nonliving)


### 5. Semantic Similarity
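Let’s measure how similar two concepts are using path similarity, which is based on the shortest path between two synsets in the hypernym/hyponym hierarchy (score = 1 / (path length + 1)).
This returns a score greater than 0 and at most 1, or `None` when no path connects the two synsets: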

1.0 → identical synsets

Lower values → more distant concepts

In [8]:
# Look up both senses by their explicit synset names; the bus is another type of vehicle.
car_syn, bus_syn = (wn.synset(name) for name in ('car.n.01', 'bus.n.01'))

similarity = car_syn.path_similarity(bus_syn)
print(f"Path similarity between 'car' and 'bus': {similarity}")

Path similarity between 'car' and 'bus': 0.125


# Tags and TF-IDF

We are identifying which tags are **most semantically similar** to a set of high TF-IDF keywords by:

1. **Decomposing compound tags** (e.g., `model_explainability` → `["model", "explainability"]`).
2. **Retrieving WordNet synsets** for both keywords and tag components (the first synset WordNet returns for each word is used as a proxy for its meaning).
3. **Computing path-based semantic similarity** between each keyword and all components of each tag.
4. **Storing the highest similarity score** for each keyword–tag pair.

This allows us to later **rank tags** based on how semantically related they are to the most important words in a document.  
#NLP #model_explainability #analysis

In [24]:
from nltk.corpus import wordnet as wn
from itertools import combinations
import numpy as np
import json

In [30]:
# Location of the vault index JSON (a relative path, resolved against the working directory).
OUTPUT_PATH = "../Data/enhanced_vault_index.json"

# Parse the whole file into `vault_index`.
with open(OUTPUT_PATH, mode="r", encoding="utf-8") as index_file:
    vault_index = json.loads(index_file.read())

In [54]:
# Entry of the vault index to analyse. Alternative ids (swap in to analyse another entry):
#   "acid_transaction", "attention_mechanism", "active_learning"
node_id = "api_driven_microservices"

node = vault_index[node_id]

# Keywords and their TF-IDF scores for the selected entry
tfidf_terms = node["TFIDF_Score"]

# Tag stems (simplified representation of tags).
# The order is significant: later cells iterate this list, and ties in the
# final ranking keep this order.
tags = [
    # machine learning
    "classifier", "regressor", "clustering", "deep_learning", "anomaly_detection",
    "ml_process", "ml_optimisation", "model_explainability", "evaluation",
    "model_algorithm", "model_architecture",
    # data preparation and governance
    "data_cleaning", "data_transformation", "data_processing", "data_engineering",
    "data_governance", "data_management", "data_quality",
    # databases and storage
    "database", "database_design", "relational_database", "database_optimisation",
    "data_storage", "data_modeling",
    # pipelines and infrastructure
    "event_driven", "data_orchestration", "data_streaming", "data_workflow",
    "cloud_computing", "querying", "big_data",
    # analysis and reporting
    "data_exploration", "communication", "data_visualization", "business_intelligence",
    # software
    "software", "code_snippet", "software_architecture",
    # maths
    "statistics", "math",
    # language and generative AI
    "GenAI", "language_models", "NLP",
    # miscellaneous
    "career", "field", "question", "drafting", "business",
]


In [55]:
# --- STEP 1: Helper to get synset for a word (first synset as proxy)
def get_first_synset(word):
    """Return the first synset WordNet lists for ``word``, or ``None`` if it lists none."""
    candidates = wn.synsets(word)
    if not candidates:
        return None
    return candidates[0]

# --- STEP 2: Synsets for keywords
# Maps every entry yielded by iterating `tfidf_terms` to its first synset
# (None when WordNet has no entry for it).
keyword_synsets = {word: get_first_synset(word) for word in tfidf_terms}

# --- STEP 3: Decompose tags and get synsets for each part
tag_word_map = {tag: tag.split('_') for tag in tags}
# Look each tag word up once and drop the words WordNet does not know.
tag_synsets = {
    tag: [syn for syn in map(get_first_synset, words) if syn]
    for tag, words in tag_word_map.items()
}

# --- STEP 4: Compute semantic similarity between each keyword synset and all tag synsets
# Keys are (keyword, tag) pairs; the value is the best similarity over the tag's words.
similarity_scores = {}

for kw, kw_syn in keyword_synsets.items():
    if not kw_syn:
        # Keyword unknown to WordNet: nothing to compare against.
        continue
    for tag, synset_list in tag_synsets.items():
        # path_similarity may return None; count that as 0.
        # `default=0` covers tags for which no word had a synset.
        max_sim = max(
            (kw_syn.path_similarity(tag_syn) or 0 for tag_syn in synset_list),
            default=0,
        )
        # Only keep pairs that are actually related.
        if max_sim > 0:
            similarity_scores[(kw, tag)] = max_sim


In [57]:
# How many of a tag's strongest keyword similarities are averaged, and how many tags are reported.
TOP_SIMS_PER_TAG = 3
TOP_TAGS = 3

# Collect, for every tag, the similarity of each keyword that matched it
tag_aggregate_scores = {}

for (keyword, tag), sim in similarity_scores.items():
    tag_aggregate_scores.setdefault(tag, []).append(sim)

# Score each tag by the mean (not the sum) of its strongest similarities;
# a tag with fewer matches than TOP_SIMS_PER_TAG is averaged over the ones it has.
tag_avg_sim = {
    tag: np.mean(sorted(sims, reverse=True)[:TOP_SIMS_PER_TAG])
    for tag, sims in tag_aggregate_scores.items()
}

# Highest-scoring tags first; tags with equal scores keep their insertion order (sorted is stable)
top_3_tags = sorted(tag_avg_sim.items(), key=lambda x: x[1], reverse=True)[:TOP_TAGS]
top_3_tags


[('event_driven', np.float64(0.45555555555555555)),
 ('model_architecture', np.float64(0.4083333333333334)),
 ('software_architecture', np.float64(0.4083333333333334))]