In [None]:
from google.colab import files
# Upload the project files this notebook works with:
# category_keywords.py, category_tree.json and web_scraper.py.
uploaded = files.upload()

Saving category_keywords.py to category_keywords.py
Saving category_tree.json to category_tree.json
Saving web_scraper.py to web_scraper.py


In [None]:
import json

# Load the category tree from file.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's locale default when the file contains non-ASCII category names.
with open("category_tree.json", "r", encoding="utf-8") as f:
    category_tree = json.load(f)


In [None]:
def find_category_path(tree, target_category, path=None):
    """Return the list of keys leading from the root of ``tree`` to ``target_category``.

    The nested mapping is searched depth-first in iteration order, so the
    first match encountered wins.

    Args:
        tree: Mapping of category name -> children. Only ``dict`` children
            are descended into; any other value is treated as a leaf.
        target_category: Key to look for.
        path: Keys already traversed above ``tree`` (used by the recursion).

    Returns:
        The keys from the outermost level down to the match (inclusive),
        or ``None`` when the key does not occur anywhere in the tree.
    """
    ancestors = [] if path is None else path

    for name, subtree in tree.items():
        trail = ancestors + [name]
        if name == target_category:
            return trail
        if not isinstance(subtree, dict):
            continue
        found = find_category_path(subtree, target_category, trail)
        if found:
            return found
    return None

In [None]:
# Look up a sample category and show the chain of keys that leads to it.
target = "real_estate"
path = find_category_path(category_tree, target)

if not path:
    print("Category not found")
else:
    print("Found path:", " > ".join(path))


Found path: real_estate


In [None]:
def find_top_level_category(tree, target_category, path=None):
    """Return the outermost ancestor key of ``target_category`` in ``tree``.

    The tree is walked depth-first in iteration order and the first match
    decides the answer. A match found at the outermost level is its own
    top-level category.

    Args:
        tree: Mapping of category name -> children. Only ``dict`` children
            are descended into.
        target_category: Key to look for.
        path: Keys already traversed above ``tree`` (used by the recursion).

    Returns:
        The first key of the path to the match, or ``None`` if the key is
        not present.
    """
    ancestors = path if path is not None else []

    for name, subtree in tree.items():
        deeper = ancestors + [name]
        if name == target_category:
            # With no ancestors the match itself is the top-level key.
            return ancestors[0] if ancestors else name
        if not isinstance(subtree, dict):
            continue
        top_level = find_top_level_category(subtree, target_category, deeper)
        if top_level:
            return top_level
    return None

In [None]:
def is_prediction_correct(predicted_top_category, true_category_label):
    """Check a predicted top-level category against the true label's ancestor.

    The true label is resolved to its top-level key in the module-level
    ``category_tree`` via ``find_top_level_category`` and compared with the
    prediction. That helper returns ``None`` for a label that is not in the
    tree, so such a label only compares equal to a ``None`` prediction.

    Returns:
        ``True`` when the prediction equals the resolved top-level category.
    """
    return predicted_top_category == find_top_level_category(
        category_tree, true_category_label
    )


In [None]:
# Sanity check: compare a predicted top-level category with a fine-grained true label.
predicted = "eat_and_drink"
true_label = "cafe"

print(is_prediction_correct(predicted, true_label))
# Prints True if the top-level category found for "cafe" is "eat_and_drink"


True


In [None]:
def embed_tree_nodes_by_layer(tree, model):
    """Encode every category name found in a nested category tree.

    The tree is walked recursively. Each key other than ``"_keywords"`` is
    passed to ``model.encode`` and the result is stored under that key. A
    name that occurs in several places is encoded only the first time.

    Args:
        tree: Nested mapping of category name -> child mapping. Values that
            are not ``dict`` instances are treated as leaves.
        model: Object exposing ``encode(text)``.

    Returns:
        dict mapping each successfully encoded category name to whatever
        ``model.encode`` returned for it. Names whose encoding raised are
        reported on stdout and left out.
    """
    embeddings = {}

    def recurse(node):
        for key, value in node.items():
            if key == "_keywords":
                continue  # keyword lists are metadata, not categories

            if key not in embeddings:
                try:
                    embeddings[key] = model.encode(key)
                except Exception as e:
                    # Best effort: report the failure and leave this key out,
                    # but still fall through to the recursion below so its
                    # descendants are not silently dropped as well.
                    print(f"Error encoding key '{key}': {e}")

            # Recurse into children if the value is a dictionary
            if isinstance(value, dict):
                recurse(value)

    recurse(tree)
    return embeddings

In [None]:
from sentence_transformers import SentenceTransformer, util
from category_keywords import category_keywords

# Load the SBERT sentence-embedding model by name. `model` is the encoder handed to
# embed_tree_nodes_by_layer and classify_with_layered_tree_top_n in later cells.
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def get_rule_score(description, node):
    """Count how many of a node's keywords occur in the description.

    Matching is a case-insensitive substring test. Both the description and
    each keyword are lower-cased, so a keyword written with capitals (for
    example ``"Church"``) still matches.

    Args:
        description: Free text to score.
        node: Tree node; its optional ``"_keywords"`` entry holds the
            keyword strings.

    Returns:
        Number of keywords found in the description (0 when the node has no
        ``"_keywords"`` entry).
    """
    desc = description.lower()
    keywords = node["_keywords"] if "_keywords" in node else ()
    return sum(1 for word in keywords if word.lower() in desc)

def classify_with_layered_tree_top_n(description, tree, embeddings, model, rule_weight=0.5, top_n=3):
    """Greedily classify a description by walking the category tree level by level.

    At each level every child category is scored as
    ``cosine_similarity(description, child) + rule_weight * keyword_hits``.
    The best-scoring child is followed, and the walk stops at a node that has
    no child categories or is not a dict.

    Args:
        description: Text to classify.
        tree: Nested mapping of category name -> child node. A node may carry
            a ``"_keywords"`` entry, which is not treated as a child.
        embeddings: Mapping of category name -> embedding vector. Every
            child visited must be present.
        model: Object exposing ``encode(text)``, used for the description.
        rule_weight: Weight of the keyword-hit count in the combined score.
        top_n: How many ranked candidates to record per level.

    Returns:
        Tuple ``(path, per_level)``. ``path`` is the chosen category names
        joined by ``" > "``. ``per_level`` holds one list per visited level
        of ``(category_name, combined_score)`` pairs, best first, at most
        ``top_n`` long.
    """
    import numpy as np

    desc_embedding = model.encode(description)
    current_node = tree
    current_path = []
    full_result = []  # store top-N scores at each layer

    # A non-dict value is a leaf; the embedding builder tolerates those too.
    while isinstance(current_node, dict):
        children = [key for key in current_node if key != "_keywords"]
        if not children:
            break

        # Stack the child vectors into one 2-D array before the similarity
        # call; a Python list of arrays takes the slow tensor-conversion path.
        child_matrix = np.stack([embeddings[child] for child in children])
        sims = util.cos_sim(desc_embedding, child_matrix)[0].cpu().numpy()

        # Keyword hits; children that are not dict nodes have no keywords.
        rule_scores = np.array([
            get_rule_score(description, current_node[child])
            if isinstance(current_node[child], dict) else 0
            for child in children
        ])

        combined_scores = sims + rule_weight * rule_scores

        # Rank everything once. The descent uses the overall best child, so it
        # does not depend on how many candidates top_n keeps for reporting.
        order = np.argsort(combined_scores)[::-1]
        full_result.append([(children[i], combined_scores[i]) for i in order[:top_n]])

        best_child = children[order[0]]
        current_path.append(best_child)
        current_node = current_node[best_child]

    return " > ".join(current_path), full_result



In [None]:
# Build the category-name -> embedding lookup from the keyword tree; the classifier
# call below receives it as its `embeddings` argument.
tree_embeddings = embed_tree_nodes_by_layer(category_keywords, model)

In [None]:
# Spot-check that a known category name received an embedding.
category = "real_estate"

if category not in tree_embeddings:
    print(f"'{category}' not found in the embeddings.")
else:
    print(f"'{category}' exists in the embeddings.")


✅ 'real_estate' exists in the embeddings.


In [None]:
# Run the layered classifier on a sample free-text description (a church's
# statement of faith). The description is kept on a single physical line: a
# double-quoted Python string literal cannot contain a raw line break.
pred_path, top_n_per_layer = classify_with_layered_tree_top_n(
    description = "About Our FaithThe following was adopted at the Calvary Armenian Congregational Church Spiritual Retreat in January, 1995:As a Christ-centered Armenian Church, the primary purpose of Calvary Armenian Congregational Church is to proclaim Jesus as Lord and Savior through worship, spiritual nourishment, fellowship, and evangelism - with special focus on the spiritual and physical needs of the Armenian people. A secondary focus is the preservation of our Armenian heritage.””The following is from the Bylaws of the Calvary Armenian Congregational Church:ARTICLE II - FAITH AND COVENANT Confessing Jesus Christ as our Savior and Lord and accepting the Holy Scriptures as the rule of faith and conduct, and recognizing the privilege and duty of uniting with one another for Christian fellowship, for performance of religious rites, for public worship of God, and for the promotion of His Kingdom in the world, we hereby, in the presence of God and beseeching His blessings, solemnly pledge and join unitedly to establish, according to the provisions of the Word of God, a Church of Jesus Christ. We promise to preserve the institutions of the Gospel, to obey the orderly administration of the church, and to walk together with brotherly love. CONFESSION OF FAITH of THE ARMENIAN EVANGELICAL CHURCHConstantinople, Turkey July 1, 1846 (abbreviated)1. I believe in the existence of one only living and true God, the creator, preserver, and governor of the universe.2. I believe that the one God exists in three persons: the Father, the Son, and the Holy Spirit.3. I believe that the Scriptures of the Old and New Testaments were given by inspiration of God, and are a revelation of His will to men, and the sufficient and only rule of faith and practice.4. I believe that mankind, in their natural state, are destitute of holiness, under the power of sin, and worthy of the wrath of God.5. I believe that the Lord Jesus Christ, is the only savior of sinners, and that by His perfect obedience, sufferings, and death, He made full atonement as the only sacrifice for sin, so that all who believe in Him will surely be saved.6. I believe that due to the pervasive sinfulness of man, all must be regenerated by the power of the Holy Spirit in order to be saved.7. I believe that we are justified by the righteousness of Christ alone, through faith, and not by any deeds of our own; and that while good works are inseparable from a true and living faith, they can never be the meritorious ground of salvation before God.8. I believe that holiness of life, and love for God, for our fellowmen, and for ourselves, are essential evidences of the Christian character.9. I believe that, besides God, no other being is to be worshiped and adored, and that all three persons of the sacred Trinity are worthy of our worship, which, to be acceptable, must be offered through no other mediation than that of Jesus Christ.10. I believe that there will be a resurrection of the dead and a day of judgment; and that the happiness of the righteous, and the punishment of the wicked, commence at death, and continue forever.11. I believe that any number of true Christians, duly organized, constitute a Church of Christ, of which Christ is the only Head; and that the sacraments of the Church are baptism and communion.12. I believe that Christ appointed the preaching of the gospel for the conversion of men and for the instruction of His people, and that it is the duty of His church to carry into effect the Savior’s command, “Go into all the world, and preach the gospel to everyone.",
    tree = category_keywords,
    embeddings = tree_embeddings,
    model = model,
    rule_weight = 0.6,
    top_n = 3
)

# Report the chosen path, then the ranked candidates recorded at each tree level.
print("Final predicted path:", pred_path)
print("\nTop candidates at each level:\n")
for level, candidates in enumerate(top_n_per_layer):
    # enumerate() counts from 0; levels are displayed starting at 1.
    print(f"Level {level + 1}:")
    for name, score in candidates:
        print(f"  {name} - score: {score:.4f}")
    # Blank line between levels.
    print()

Final predicted path: religious_organization > church_cathedral

Top candidates at each level:

Level 1:
  religious_organization - score: 3.4614
  active_life - score: 0.6563
  mass_media - score: 0.5938

Level 2:
  church_cathedral - score: 1.5832
  temple - score: 1.4259
  mission - score: 0.8523



  a = torch.tensor(a)
