In [1]:
# First, change the working directory to the project root.
# The relative paths used in later cells (e.g. data/corpus/... and data/outputs/...)
# are resolved against this directory.
# Example: %cd /home/your_username/MetaHarmonizer
# NOTE(review): the path below is machine-specific; replace it with the location of your own checkout.

%cd /home/lcc/projects/MetaHarmonizer

/home/lcc/projects/MetaHarmonizer


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [None]:
# Required files:
# data/corpus/oncotree_code_to_name.csv

In [2]:
# This is required to run asyncio code in Jupyter notebooks.
# Jupyter already runs its own event loop, so calling asyncio.run() directly would raise an error.
# nest_asyncio.apply() patches the loop to allow nested usage.
# NOTE(review): presumably some engine stages run asyncio code internally, which is why
# this cell sits before the engine cells — confirm against the engine implementation.

import nest_asyncio

nest_asyncio.apply()

In [3]:
# Import core modules and ontology mapper implementations

from importlib import reload
from pathlib import Path

import pandas as pd

# Import different ontology mapping strategies (ST, LM, RAG, Bi-Encoder)
from src.models import ontology_mapper_st as om_st
from src.models import ontology_mapper_lm as om_lm
from src.models import ontology_mapper_rag as om_rag
from src.models import ontology_mapper_bi_encoder as om_bi

# Import the engine that handles pipeline logic and integrates the mappers
from src.Engine import get_ontology_engine

# Resolve the engine class once; every strategy cell below instantiates it.
OntoMapEngine = get_ontology_engine()

# Reload modules to reflect any code updates during development (useful in Jupyter)
# NOTE(review): the engine class above is obtained before these reloads run — confirm
# that it actually picks up the reloaded mapper modules.
# The final reload() call is the cell's last expression, so its module repr is shown as the cell output.
reload(om_st)
reload(om_lm)
reload(om_rag)
reload(om_bi)

<module 'src.models.ontology_mapper_bi_encoder' from '/home/lcc/projects/MetaHarmonizer/src/models/ontology_mapper_bi_encoder.py'>

In [4]:
# Import the CalcStats class for calculating Top1, Top3, and Top5 accuracy
# The single `calc` instance created here is reused by every evaluation cell below.

from evaluation.calc_stats import CalcStats

calc = CalcStats()

In [None]:
# Optional utility: Clean up the FAISS + SQLite vector store
# Useful after testing or re-running experiments to avoid stale data
# Not required for standard inference or training runs
# CAUTION: this call is destructive. Per the saved output it drops the matching SQLite table
# and deletes the matching FAISS index file, so skip this cell unless a rebuild is intended.

from src.utils.cleanup_vector_store import cleanup_vector_store

cleanup_vector_store("st", "mt-sap-bert", "disease")  # (strategy, model_name, entity_type)

[Success] Table 'st_mt_sap_bert_disease' dropped from src/KnowledgeDb/vector_db.sqlite
[Success] Index file 'src/KnowledgeDb/faiss_indexes/st_mt-sap-bert_disease.index' deleted.


DF

In [5]:
# Load the disease query file (queries plus their curated answers) and the large corpus file.
df = pd.read_csv("data/corpus/cbio_disease/disease_query_updated.csv")
large_corpus = pd.read_csv("data/corpus/cbio_disease/disease_corpus_updated.csv")

# Raw values that need to be mapped.
query_list = df["original_value"].tolist()
# Small corpus: the curated ontology values taken from the query file itself.
small_corpus_list = df["curated_ontology"].tolist()

# Large corpus: prefer the "official_label" column and fall back to "label".
if "official_label" in large_corpus.columns:
    large_corpus_list = large_corpus["official_label"].tolist()
else:
    large_corpus_list = large_corpus["label"].tolist()

# Ground-truth lookup: original value -> curated ontology value.
cura_map = dict(zip(query_list, small_corpus_list))

In [6]:
# Stage 1 (exact match) followed by Stage 2 (ST); Stage 3 is switched off with s3_strategy=None.
# Pooling: the ST strategy uses the default pooling method, the LM strategy uses CLS token pooling.
# OntoMapEngine arguments used here:
# - method: name of the embedding model (here "mt-sap-bert")
# - category: entity type being mapped (here "disease")
# - topk: retrieve the top k matches
# - query: list of original values to map
# - corpus: list of ontology values to match against
# - cura_map: dictionary mapping original values to curated ontology values
# - s2_strategy / s3_strategy: strategies for Stage 2 and Stage 3
# run() returns a DataFrame with original values, curated ontology values, match levels,
# stage, and the top k matches with their scores.

other_params = {"test_or_prod": "test"}
onto_engine_large = OntoMapEngine(
    method="mt-sap-bert",
    category="disease",
    topk=5,
    query=query_list,
    corpus=large_corpus_list,
    cura_map=cura_map,
    s2_strategy="st",
    s3_strategy=None,
    **other_params,
)
st_sapbert_disease_top5_result = onto_engine_large.run()

29/10/2025 03:22:10 AM - INFO - OntoMapEngine: Initialized OntoMap Engine
29/10/2025 03:22:10 AM - INFO - OntoMapEngine: Stage 1: Exact matching
29/10/2025 03:22:10 AM - INFO - OntoMapEngine: Stage 2: ST
29/10/2025 03:22:10 AM - INFO - OntoMapEngine: Stage 3: Disabled
29/10/2025 03:22:10 AM - INFO - OntoMapEngine: Starting Ontology Mapping
29/10/2025 03:22:10 AM - INFO - OntoMapEngine: Stage 1: Exact Matching
29/10/2025 03:22:10 AM - INFO - OntoMapEngine: Exact matches: 342
29/10/2025 03:22:10 AM - INFO - OntoMapEngine: Remaining for Stage 2: 1213
29/10/2025 03:22:10 AM - INFO - OntoMapEngine: Stage 2: ST Matching
29/10/2025 03:22:10 AM - INFO - OntoMapEngine: Replacing shortNames using rule-based name mapping
29/10/2025 03:22:10 AM - INFO - OntoMapEngine: Replaced: ACC → Adrenocortical Carcinoma
29/10/2025 03:22:10 AM - INFO - OntoMapEngine: Replaced: ACYC → Adenoid Cystic Carcinoma
29/10/2025 03:22:10 AM - INFO - OntoMapEngine: Replaced: AML → Acute Myeloid Leukemia
29/10/2025 03:22:

No sentence-transformers model found with name model_cache/mt-sap-bert. Creating a new one with mean pooling.
No sentence-transformers model found with name cambridgeltl/SapBERT-from-PubMedBERT-fulltext-mean-token. Creating a new one with mean pooling.
  return forward_call(*args, **kwargs)


29/10/2025 03:22:18 AM - INFO - OntoMapEngine: Stage 2 completed: 1213 queries
29/10/2025 03:22:18 AM - INFO - OntoMapEngine: Stage 3: Disabled
29/10/2025 03:22:18 AM - INFO - OntoMapEngine: FINAL SUMMARY
29/10/2025 03:22:18 AM - INFO - OntoMapEngine: Stage 1 (Exact): 342 queries
29/10/2025 03:22:18 AM - INFO - OntoMapEngine: Stage 2 (ST): 1213 queries


In [7]:
# Calculate Top1, Top3, and Top5 accuracy for the generated results.
# The result is left as the cell's last expression so Jupyter renders it as a table
# instead of plain printed text.

st_sapbert_accuracy_df = calc.calc_accuracy(st_sapbert_disease_top5_result)
st_sapbert_accuracy_df

  Accuracy Level   Accuracy
0    Top 1 Match  75.562701
1  Top 3 Matches  84.694534
2  Top 5 Matches  87.781350


In [8]:
# Save the results to a CSV file for further analysis or reporting. Optional.
# DataFrame.to_csv does not create missing folders, so make sure the nested
# output directory exists before writing; otherwise the export raises OSError.

st_sapbert_output_path = Path(
    "data/outputs/2025/large_corpus/1024/st_sapbert_disease_top5_result.csv")
st_sapbert_output_path.parent.mkdir(parents=True, exist_ok=True)
st_sapbert_disease_top5_result.to_csv(st_sapbert_output_path, index=False)

In [None]:
# RAG strategy: Stage 2 runs ST, then Stage 3 re-processes low-scoring queries with RAG.
# corpus_df (the corpus DataFrame) is needed for concept retrieval.
# s3_threshold is not passed here; the saved run log reports "threshold=0.9" for this configuration.
other_params = {"test_or_prod": "test"}
onto_engine_large = OntoMapEngine(
    method="pubmed-bert",
    category="disease",
    topk=5,
    query=query_list,
    corpus=large_corpus_list,
    cura_map=cura_map,
    corpus_df=large_corpus,
    s2_strategy="st",
    s3_strategy="rag",
    **other_params,
)
st_rag_pubmedbert_disease_top5_result = onto_engine_large.run()

29/10/2025 03:15:44 AM - INFO - OntoMapEngine: Initialized OntoMap Engine
29/10/2025 03:15:44 AM - INFO - OntoMapEngine: Stage 1: Exact matching
29/10/2025 03:15:44 AM - INFO - OntoMapEngine: Stage 2: ST
29/10/2025 03:15:44 AM - INFO - OntoMapEngine: Stage 3: RAG (threshold=0.9)
29/10/2025 03:15:44 AM - INFO - OntoMapEngine: Starting Ontology Mapping
29/10/2025 03:15:44 AM - INFO - OntoMapEngine: Stage 1: Exact Matching
29/10/2025 03:15:44 AM - INFO - OntoMapEngine: Exact matches: 341
29/10/2025 03:15:44 AM - INFO - OntoMapEngine: Remaining for Stage 2: 1214
29/10/2025 03:15:44 AM - INFO - OntoMapEngine: Stage 2: ST Matching
29/10/2025 03:15:44 AM - INFO - OntoMapEngine: Replacing shortNames using rule-based name mapping
29/10/2025 03:15:44 AM - INFO - OntoMapEngine: Replaced: ACC → Adrenocortical Carcinoma
29/10/2025 03:15:44 AM - INFO - OntoMapEngine: Replaced: ACYC → Adenoid Cystic Carcinoma
29/10/2025 03:15:44 AM - INFO - OntoMapEngine: Replaced: AML → Acute Myeloid Leukemia
29/10/

  return forward_call(*args, **kwargs)


29/10/2025 03:15:49 AM - INFO - OntoMapEngine: Stage 2 completed: 1214 queries
29/10/2025 03:15:49 AM - INFO - OntoMapEngine: Stage 3: RAG Matching
29/10/2025 03:15:49 AM - INFO - OntoMapEngine: S2 result columns: ['original_value', 'updated_value', 'curated_ontology', 'match_level', 'top1_match', 'top1_score', 'top2_match', 'top2_score', 'top3_match', 'top3_score', 'top4_match', 'top4_score', 'top5_match', 'top5_score', 'stage']
29/10/2025 03:15:49 AM - INFO - OntoMapEngine: S2 result top1_score dtype: object
29/10/2025 03:15:49 AM - INFO - OntoMapEngine: S2 result top1_score unique values (first 10): ['0.9152' '0.9438' '0.8933' '0.9847' '0.9639' '0.9459' '0.9902' '0.9771'
 '0.9838' '0.9519']
29/10/2025 03:15:49 AM - INFO - OntoMapEngine: Queries with top1_score < 0.9: 65
29/10/2025 03:15:49 AM - INFO - OntoMapRAG: Initialized OntoMapRAG module
29/10/2025 03:15:49 AM - INFO - OntoMapRAG: Generating results table


Processing queries:   0%|          | 0/65 [00:00<?, ?it/s]

29/10/2025 03:15:50 AM - INFO - FAISSSQLiteSearch: All corpus terms already processed.
29/10/2025 03:15:50 AM - INFO - OntoMapRAG: True - Vector store initialized for method=pubmed-bert, category=disease, om_strategy=rag


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)
Processing queries:   2%|▏         | 1/65 [00:00<00:44,  1.43it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:   8%|▊         | 5/65 [00:00<00:07,  7.88it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  14%|█▍        | 9/65 [00:00<00:04, 13.73it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  18%|█▊        | 12/65 [00:01<00:03, 17.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  23%|██▎       | 15/65 [00:01<00:02, 19.97it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  29%|██▉       | 19/65 [00:01<00:01, 23.93it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  35%|███▌      | 23/65 [00:01<00:01, 26.22it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  42%|████▏     | 27/65 [00:01<00:01, 28.38it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  48%|████▊     | 31/65 [00:01<00:01, 29.12it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  54%|█████▍    | 35/65 [00:01<00:00, 30.96it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  60%|██████    | 39/65 [00:01<00:00, 30.92it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  66%|██████▌   | 43/65 [00:01<00:00, 31.13it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  72%|███████▏  | 47/65 [00:02<00:00, 31.97it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  78%|███████▊  | 51/65 [00:02<00:00, 30.66it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  85%|████████▍ | 55/65 [00:02<00:00, 31.24it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  91%|█████████ | 59/65 [00:02<00:00, 32.13it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  97%|█████████▋| 63/65 [00:02<00:00, 31.15it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries: 100%|██████████| 65/65 [00:02<00:00, 24.12it/s]

29/10/2025 03:15:52 AM - INFO - OntoMapRAG: Results Generated
29/10/2025 03:15:52 AM - INFO - OntoMapEngine: Stage 3 completed: 65 queries
29/10/2025 03:15:52 AM - INFO - OntoMapEngine: FINAL SUMMARY
29/10/2025 03:15:52 AM - INFO - OntoMapEngine: Stage 1 (Exact): 341 queries
29/10/2025 03:15:52 AM - INFO - OntoMapEngine: Stage 2 (ST): 1149 queries
29/10/2025 03:15:52 AM - INFO - OntoMapEngine: Stage 3 (RAG): 65 queries





In [None]:
# Top1 / Top3 / Top5 accuracy for the ST + RAG run.
# The result is left as the cell's last expression so Jupyter renders it as a table
# instead of plain printed text.
st_rag_pubmedbert_disease_top5_eval = calc.calc_accuracy(
    st_rag_pubmedbert_disease_top5_result)
st_rag_pubmedbert_disease_top5_eval

  Accuracy Level   Accuracy
0    Top 1 Match  72.990354
1  Top 3 Matches  81.221865
2  Top 5 Matches  84.630225


In [10]:
# Save the ST + RAG results to CSV. Optional.
# DataFrame.to_csv does not create missing folders, so make sure the nested
# output directory exists before writing; otherwise the export raises OSError.
st_rag_output_path = Path(
    "data/outputs/2025/large_corpus/1024/st_rag_pubmedbert_disease_top5_result.csv")
st_rag_output_path.parent.mkdir(parents=True, exist_ok=True)
st_rag_pubmedbert_disease_top5_result.to_csv(st_rag_output_path, index=False)

In [9]:
# rag_bie Strategy: Need corpus_df for concept retrieval and query_df for query enrichment.
# Note: rag_bie is a query-enriched variant of RAG, so the query file must carry the expanded fields.
#
# WARNING: this cell rebinds query_list, large_corpus, large_corpus_list, cura_map and
# onto_engine_large, which the ST / RAG cells above also use. Re-run the data-loading
# cell before re-running those cells, otherwise they will silently use this dataset.

query_df = pd.read_csv("data/corpus/cbio_disease/query_with_selected_fields_for_bie.csv")
large_corpus = pd.read_csv(
    'data/corpus/cbio_disease/disease_corpus_updated.csv')

# TODO: use a common schema for all strategies.
query_list = query_df['original_cancer_type_value'].tolist()

# Use the same column fallback as the earlier data-loading cell, which reads the same
# corpus file: prefer "official_label", fall back to "label" instead of raising KeyError.
large_corpus_list = (
    large_corpus["official_label"].tolist()
    if "official_label" in large_corpus.columns
    else large_corpus["label"].tolist()
)

# Ground-truth lookup: original cancer type value -> curated official label (from the query file).
cura_map = dict(zip(query_df['original_cancer_type_value'], query_df['official_label']))

# run rag_bie strategy:
# NOTE(review): the saved log shows the Bi-Encoder iterating over all 214 query_df rows
# although only 20 queries were reported for Stage 3 — confirm this is intended.
other_params = {"test_or_prod": "test"}
onto_engine_large = OntoMapEngine(method='pubmed-bert',
                                  category='disease',
                                  topk=20,
                                  query=query_list,
                                  corpus=large_corpus_list,
                                  cura_map=cura_map,
                                  s2_strategy='st',
                                  s3_strategy='rag_bie',
                                  s3_threshold=0.95,
                                  corpus_df=large_corpus,
                                  query_df=query_df,
                                  **other_params)
pubmedbert_rag_bie_result = onto_engine_large.run()

29/10/2025 03:22:41 AM - INFO - OntoMapEngine: Initialized OntoMap Engine
29/10/2025 03:22:41 AM - INFO - OntoMapEngine: Stage 1: Exact matching
29/10/2025 03:22:41 AM - INFO - OntoMapEngine: Stage 2: ST
29/10/2025 03:22:41 AM - INFO - OntoMapEngine: Stage 3: RAG_BIE (threshold=0.95)
29/10/2025 03:22:41 AM - INFO - OntoMapEngine: Starting Ontology Mapping
29/10/2025 03:22:41 AM - INFO - OntoMapEngine: Stage 1: Exact Matching
29/10/2025 03:22:41 AM - INFO - OntoMapEngine: Exact matches: 66
29/10/2025 03:22:41 AM - INFO - OntoMapEngine: Remaining for Stage 2: 92
29/10/2025 03:22:41 AM - INFO - OntoMapEngine: Stage 2: ST Matching
29/10/2025 03:22:41 AM - INFO - OntoMapEngine: Replacing shortNames using rule-based name mapping
29/10/2025 03:22:41 AM - INFO - OntoMapST: Initialized OntoMap Sentence Transformer module
29/10/2025 03:22:46 AM - INFO - OntoMapEngine: Stage 2 completed: 92 queries
29/10/2025 03:22:46 AM - INFO - OntoMapEngine: Stage 3: RAG_BIE Matching
29/10/2025 03:22:46 AM - I

  return forward_call(*args, **kwargs)


29/10/2025 03:22:46 AM - INFO - OntoMapBIE: Initialized Bi-Encoder (query with context) module


Adding context to query_df: 100%|██████████| 214/214 [01:03<00:00,  3.38it/s]
Processing queries (Bi-Encoder):   0%|          | 0/214 [00:00<?, ?it/s]

29/10/2025 03:23:50 AM - INFO - FAISSSQLiteSearch: 13927 new terms to add to the index.
29/10/2025 03:23:50 AM - INFO - FAISSSQLiteSearch: Using provided DataFrame to update term-code pairs.
29/10/2025 03:23:50 AM - INFO - FAISSSQLiteSearch: Using provided DataFrame to fetch term-code pairs.
29/10/2025 03:23:50 AM - INFO - FAISSSQLiteSearch: Retrieved codes for 13924 terms
29/10/2025 03:23:50 AM - INFO - FAISSSQLiteSearch: Fetching concept data for 13927 unique codes
29/10/2025 03:23:50 AM - INFO - NCIDb: Fetching concept data for 13927 codes in batches of 50
29/10/2025 03:23:54 AM - INFO - NCIDb: Processed batch 1 of 279
29/10/2025 03:23:55 AM - INFO - NCIDb: Processed batch 2 of 279
29/10/2025 03:24:00 AM - INFO - NCIDb: Processed batch 4 of 279
29/10/2025 03:24:03 AM - INFO - NCIDb: Processed batch 5 of 279
29/10/2025 03:24:06 AM - INFO - NCIDb: Processed batch 6 of 279
29/10/2025 03:24:06 AM - INFO - NCIDb: Processed batch 3 of 279
29/10/2025 03:24:09 AM - INFO - NCIDb: Processed b

Building context and records: 100%|██████████| 13924/13924 [00:01<00:00, 12066.97it/s]

29/10/2025 03:37:26 AM - INFO - FAISSSQLiteSearch: Inserting 13846 records into SQLite





29/10/2025 03:37:27 AM - INFO - FAISSSQLiteSearch: Starting vector embedding and FAISS index insertion




Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/32 [00:00<?, ?it/s]



Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding batches: 100%|██████████| 28/28 [01:31<00:00,  3.25s/it]

29/10/2025 03:38:58 AM - INFO - FAISSSQLiteSearch: Finished fetching and storing all terms.
29/10/2025 03:38:58 AM - INFO - OntoMapBIE: True - Vector store initialized for method=pubmed-bert, category=disease, om_strategy=rag_bie





Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):   0%|          | 1/214 [15:08<53:45:31, 908.60s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):   2%|▏         | 5/214 [15:08<7:51:28, 135.35s/it] 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):   4%|▍         | 9/214 [15:08<3:28:45, 61.10s/it] 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):   6%|▌         | 13/214 [15:08<1:54:47, 34.27s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):   8%|▊         | 17/214 [15:09<1:09:09, 21.06s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  10%|▉         | 21/214 [15:09<43:43, 13.59s/it]  

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  12%|█▏        | 25/214 [15:09<28:25,  9.03s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  14%|█▎        | 29/214 [15:09<18:49,  6.11s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  15%|█▌        | 33/214 [15:09<12:36,  4.18s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  17%|█▋        | 37/214 [15:09<08:31,  2.89s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  19%|█▉        | 41/214 [15:09<05:47,  2.01s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  21%|██        | 45/214 [15:09<03:57,  1.40s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  23%|██▎       | 49/214 [15:10<02:42,  1.01it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  25%|██▍       | 53/214 [15:10<01:52,  1.43it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  27%|██▋       | 57/214 [15:10<01:18,  2.01it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  29%|██▊       | 61/214 [15:10<00:54,  2.80it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  30%|███       | 65/214 [15:10<00:38,  3.87it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  32%|███▏      | 69/214 [15:10<00:27,  5.28it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  34%|███▍      | 73/214 [15:10<00:20,  7.03it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  36%|███▌      | 77/214 [15:11<00:14,  9.14it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  38%|███▊      | 81/214 [15:11<00:11, 11.68it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  40%|███▉      | 85/214 [15:11<00:08, 14.66it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  42%|████▏     | 89/214 [15:11<00:07, 17.60it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  43%|████▎     | 93/214 [15:11<00:05, 20.91it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  45%|████▌     | 97/214 [15:11<00:04, 23.93it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  47%|████▋     | 101/214 [15:11<00:04, 26.31it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  49%|████▉     | 105/214 [15:11<00:04, 25.81it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  51%|█████     | 109/214 [15:11<00:03, 26.98it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  53%|█████▎    | 113/214 [15:12<00:03, 28.02it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  55%|█████▍    | 117/214 [15:12<00:03, 29.19it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  57%|█████▋    | 121/214 [15:12<00:02, 31.00it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  58%|█████▊    | 125/214 [15:12<00:02, 32.24it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  60%|██████    | 129/214 [15:12<00:02, 33.31it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  62%|██████▏   | 133/214 [15:12<00:02, 34.06it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  64%|██████▍   | 137/214 [15:12<00:02, 34.85it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  66%|██████▌   | 141/214 [15:12<00:02, 32.74it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  68%|██████▊   | 145/214 [15:13<00:02, 33.29it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  70%|██████▉   | 149/214 [15:13<00:01, 32.70it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  71%|███████▏  | 153/214 [15:13<00:01, 33.31it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  73%|███████▎  | 157/214 [15:13<00:01, 32.79it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  75%|███████▌  | 161/214 [15:13<00:01, 33.71it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  77%|███████▋  | 165/214 [15:13<00:01, 32.72it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  79%|███████▉  | 169/214 [15:13<00:01, 31.63it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  81%|████████  | 173/214 [15:13<00:01, 32.93it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  83%|████████▎ | 177/214 [15:14<00:01, 33.15it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  85%|████████▍ | 181/214 [15:14<00:01, 31.22it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  86%|████████▋ | 185/214 [15:14<00:00, 30.52it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  88%|████████▊ | 189/214 [15:14<00:00, 29.06it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  90%|█████████ | 193/214 [15:14<00:00, 29.32it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  92%|█████████▏| 196/214 [15:14<00:00, 27.42it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  93%|█████████▎| 199/214 [15:14<00:00, 27.18it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  95%|█████████▍| 203/214 [15:14<00:00, 28.49it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  97%|█████████▋| 207/214 [15:15<00:00, 29.28it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  99%|█████████▊| 211/214 [15:15<00:00, 30.43it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder): 100%|██████████| 214/214 [15:15<00:00,  4.28s/it]

29/10/2025 03:39:05 AM - INFO - OntoMapBIE: Bi-Encoder Results Generated
29/10/2025 03:39:05 AM - INFO - OntoMapEngine: Stage 3 completed: 20 queries





29/10/2025 03:39:05 AM - INFO - OntoMapEngine: FINAL SUMMARY
29/10/2025 03:39:05 AM - INFO - OntoMapEngine: Stage 1 (Exact): 66 queries
29/10/2025 03:39:05 AM - INFO - OntoMapEngine: Stage 2 (ST): 78 queries
29/10/2025 03:39:05 AM - INFO - OntoMapEngine: Stage 3 (RAG_BIE): 20 queries


In [10]:
# Top1 / Top3 / Top5 accuracy for the ST + RAG_BIE run.
# The result is left as the cell's last expression so Jupyter renders it as a table
# instead of plain printed text.
pubmedbert_rag_bie_accuracy_df = calc.calc_accuracy(pubmedbert_rag_bie_result)
pubmedbert_rag_bie_accuracy_df

  Accuracy Level   Accuracy
0    Top 1 Match  76.219512
1  Top 3 Matches  77.439024
2  Top 5 Matches  82.926829


Workflow Code

In [12]:
from graphviz import Digraph

# ---------------------------------------------------------------------------
# 3-stage ontology-mapping workflow diagram.
#
# The diagram is described as data (cluster specs + an ordered edge list) and
# then emitted with a few short loops. Statement order matters: Graphviz lays
# out nodes/edges in the order they are added, so the tables below follow the
# intended top-to-bottom reading order of the flowchart.
# ---------------------------------------------------------------------------


def _filled(fillcolor, shape='box'):
    """Return node attributes for a colour-filled node of the given shape."""
    return {'shape': shape, 'style': 'filled', 'fillcolor': fillcolor}


PLAIN_BOX = {'shape': 'box'}
DECISION = {'shape': 'diamond'}

dot = Digraph(comment='3-Stage Ontology Mapping Workflow')
dot.attr(rankdir='TB', fontsize='10', fontname='Helvetica')

# Entry point: the arguments the engine is called with.
dot.node(
    'A',
    'Input Arguments\n(query, corpus, s2_strategy, s3_strategy, s3_threshold)',
    **_filled('lightgray'),
)

# Each entry: (cluster name, cluster attributes, [(node id, label, node attributes), ...])
stage_clusters = [
    (
        'cluster_s1',
        {'label': 'Stage 1: Exact Matching', 'style': 'filled',
         'fillcolor': 'lightblue', 'fontsize': '11'},
        [
            ('S1_EXACT', 'Exact Match?', DECISION),
            ('S1_MATCHED', 'Exact Matched Terms\n(stage=1)', _filled('lightgreen')),
        ],
    ),
    (
        'cluster_s2',
        {'label': 'Stage 2: LM/ST (Transformer-based)', 'style': 'filled',
         'fillcolor': 'lightyellow', 'fontsize': '11'},
        [
            ('S2_ABBR', 'Replace Abbreviations\n(via abbreviation dict)', PLAIN_BOX),
            ('S2_STRATEGY', 'Choose s2_strategy:\nLM or ST', _filled('orange')),
            ('S2_FAISS', 'Check FAISS Index &\nSQLite Table', PLAIN_BOX),
            ('S2_BUILD', 'Encode Corpus Terms\n→ Store in FAISS & SQLite', PLAIN_BOX),
            ('S2_CHECK', 'Check Completeness\n→ Append Missing if Needed', PLAIN_BOX),
            ('S2_ENCODE', 'Encode Query Terms', _filled('lightyellow')),
            ('S2_SEARCH', 'FAISS Search\n(Get top-k matches + scores)',
             _filled('orange', shape='ellipse')),
            ('S2_RESULT', 'Stage 2 Results\n(stage=2, with top1_score)',
             _filled('lightgreen')),
        ],
    ),
    (
        'cluster_s3',
        {'label': 'Stage 3: RAG/RAG_BIE (Optional, for low-confidence queries)',
         'style': 'filled', 'fillcolor': 'lightcoral', 'fontsize': '11'},
        [
            ('S3_CHECK', 'top1_score < s3_threshold?', DECISION),
            ('S3_SKIP', 'Skip Stage 3', {'shape': 'box', 'style': 'dashed'}),
            ('S3_ABBR', 'Replace Abbreviations\nfor low-confidence queries', PLAIN_BOX),
            ('S3_STRATEGY', 'Choose s3_strategy:\nRAG or RAG_BIE', _filled('orange')),
            ('S3_FAISS', 'Check FAISS Index &\nSQLite Table', PLAIN_BOX),
            ('S3_BUILD', 'Encode Corpus Contexts\n→ Store in FAISS & SQLite', PLAIN_BOX),
            ('S3_CHECK_DB', 'Check Completeness\n→ Append Missing if Needed', PLAIN_BOX),
            ('S3_ENCODE_RAG', 'RAG:\nEncode Query Terms', _filled('lightcoral')),
            ('S3_ENCODE_RAGBIE', 'RAG_BIE:\nEnrich + Encode Query', _filled('lightcoral')),
            ('S3_SEARCH', 'FAISS Search\n(Get top-k matches + scores)',
             _filled('orange', shape='ellipse')),
            ('S3_RESULT',
             'Stage 3 Results\n(stage=3, overrides low-confidence S2 results)',
             _filled('lightgreen')),
        ],
    ),
]

for cluster_name, cluster_attrs, cluster_nodes in stage_clusters:
    with dot.subgraph(name=cluster_name) as cluster:
        cluster.attr(**cluster_attrs)
        for node_id, node_label, node_attrs in cluster_nodes:
            cluster.node(node_id, node_label, **node_attrs)

# Nodes that sit outside the stage clusters: the merge step and the final output.
dot.node('MERGE', 'Merge All Stages:\nStage 1 + Stage 2 (filtered) + Stage 3',
         **_filled('lightgreen'))
dot.node('OUTPUT', 'Final Output DataFrame\n(with stage column: 1, 2, or 3)',
         **_filled('green'))

# Each entry: (tail, head, edge label or None for an unlabelled edge).
flow_edges = [
    # Input to Stage 1
    ('A', 'S1_EXACT', None),
    # Stage 1 flow
    ('S1_EXACT', 'S1_MATCHED', 'Yes'),
    ('S1_EXACT', 'S2_ABBR', 'No\n(unmatched queries)'),
    # Stage 2 flow
    ('S2_ABBR', 'S2_STRATEGY', None),
    ('S2_STRATEGY', 'S2_FAISS', None),
    ('S2_FAISS', 'S2_BUILD', 'Not Exist'),
    ('S2_FAISS', 'S2_CHECK', 'Exist'),
    ('S2_BUILD', 'S2_ENCODE', None),
    ('S2_CHECK', 'S2_ENCODE', None),
    ('S2_ENCODE', 'S2_SEARCH', None),
    ('S2_SEARCH', 'S2_RESULT', None),
    # Stage 2 to Stage 3 decision
    ('S2_RESULT', 'S3_CHECK', None),
    ('S3_CHECK', 'S3_SKIP', 'No / s3_strategy=None'),
    ('S3_CHECK', 'S3_ABBR', 'Yes'),
    # Stage 3 flow
    ('S3_ABBR', 'S3_STRATEGY', None),
    ('S3_STRATEGY', 'S3_FAISS', None),
    ('S3_FAISS', 'S3_BUILD', 'Not Exist'),
    ('S3_FAISS', 'S3_CHECK_DB', 'Exist'),
    ('S3_BUILD', 'S3_ENCODE_RAG', 'RAG'),
    ('S3_BUILD', 'S3_ENCODE_RAGBIE', 'RAG_BIE'),
    ('S3_CHECK_DB', 'S3_ENCODE_RAG', 'RAG'),
    ('S3_CHECK_DB', 'S3_ENCODE_RAGBIE', 'RAG_BIE'),
    ('S3_ENCODE_RAG', 'S3_SEARCH', None),
    ('S3_ENCODE_RAGBIE', 'S3_SEARCH', None),
    ('S3_SEARCH', 'S3_RESULT', None),
    # Merge all stages
    ('S1_MATCHED', 'MERGE', None),
    ('S2_RESULT', 'MERGE', '(if S3 skipped)'),
    ('S3_SKIP', 'MERGE', None),
    ('S3_RESULT', 'MERGE', None),
    # Final output
    ('MERGE', 'OUTPUT', None),
]

for tail, head, edge_label in flow_edges:
    dot.edge(tail, head, label=edge_label)

# Legend-style note; adjacent literals concatenate into one multi-line label.
note_text = (
    'Note:\n'
    '• Stage 1: Exact matching (always runs)\n'
    '• Stage 2: LM/ST transformer matching (always runs)\n'
    '• Stage 3: RAG/RAG_BIE context-based matching\n'
    '  (only for queries with top1_score < s3_threshold)\n'
    '• RAG_BIE enriches queries before encoding'
)
dot.node('NOTE', note_text,
         shape='note', style='dashed', fontsize='9', fontcolor='slategray')
# Invisible edge: only there to pin the note below the output node in the layout.
dot.edge('OUTPUT', 'NOTE', style='invis')

# Render to PNG in the current working directory; cleanup=True removes the
# intermediate DOT source file after rendering.
dot.render('ontology_mapping_3stage_workflow', format='png', cleanup=True)
print("Flowchart saved as 'ontology_mapping_3stage_workflow.png'")

Flowchart saved as 'ontology_mapping_3stage_workflow.png'
