In [1]:
# First, change the working directory to the project root.
# Example: %cd /home/your_username/MetaHarmonizer

%cd /home/lcc/projects/MetaHarmonizer

/home/lcc/projects/MetaHarmonizer


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [None]:
# Required files:
# data/corpus/oncotree_code_to_name.csv

In [2]:
# Allow asyncio code to run inside this notebook.
# Jupyter already runs its own event loop, so calling asyncio.run() directly would raise an error.
# nest_asyncio.apply() patches the loop so that nested usage is allowed.

import nest_asyncio

nest_asyncio.apply()

In [3]:
# Import pandas (used below to read the query/corpus CSV files) and the
# ontology-mapping engine factory.

import pandas as pd

# The engine handles the pipeline logic and integrates the mappers.
# The object returned by get_ontology_engine() is bound to OntoMapEngine; the
# cells below call it with keyword arguments and then call .run() on the result.
from src.Engine import get_ontology_engine

OntoMapEngine = get_ontology_engine()

In [4]:
# CalcStats is used to calculate Top1, Top3, and Top5 accuracy.
# One instance (`calc`) is created here and reused by every evaluation cell below.

from evaluation.calc_stats import CalcStats

calc = CalcStats()

In [None]:
# Optional utility: clean up the FAISS + SQLite vector store.
# Useful after testing or re-running experiments to avoid stale data.
# Not required for standard inference or training runs.
# NOTE(review): judging by the recorded output, this call is destructive: it
# drops the matching SQLite table and deletes the matching FAISS index file.
# Presumably they are rebuilt on the next run; confirm before using on shared data.

from src.utils.cleanup_vector_store import cleanup_vector_store

cleanup_vector_store("st", "mt-sap-bert", "disease")  # (strategy, model_name, entity_type)

[Success] Table 'st_mt_sap_bert_disease' dropped from src/KnowledgeDb/vector_db.sqlite
[Success] Index file 'src/KnowledgeDb/faiss_indexes/st_mt-sap-bert_disease.index' deleted.


DF

In [5]:
# Load the disease query table and the full ontology corpus.
df = pd.read_csv("data/corpus/cbio_disease/disease_query_updated.csv")
large_corpus = pd.read_csv("data/corpus/cbio_disease/disease_corpus_updated.csv")

# Raw values to map, plus their curated ontology terms.
# The curated terms from the query file double as the "small" corpus.
query_list = df["original_value"].tolist()
small_corpus_list = df["curated_ontology"].tolist()

# The "large" corpus prefers the 'official_label' column and falls back to 'label'.
if "official_label" in large_corpus.columns:
    large_corpus_list = large_corpus["official_label"].tolist()
else:
    large_corpus_list = large_corpus["label"].tolist()

# Ground-truth lookup: original value -> curated ontology term.
cura_map = dict(zip(df["original_value"], df["curated_ontology"]))

In [6]:
# Run the engine with the ST strategy in Stage 2 and Stage 3 switched off.
# Pooling: the ST strategy uses the default pooling method; the LM strategy uses CLS token pooling.
# Engine arguments:
# - method: model used for matching (here 'mt-sap-bert')
# - category: kind of entity being mapped (here 'disease')
# - topk: retrieve the top k matches
# - query: list of original values to map
# - corpus: list of ontology terms to match against (here the large corpus labels)
# - cura_map: dictionary mapping original values to curated ontology values
# - s2_strategy / s3_strategy: strategy for Stage 2 / Stage 3 (None disables Stage 3)
# run() returns a DF with original values, curated ontology values, match levels,
# stage, and the top k matches with scores.

other_params = {"test_or_prod": "test"}
onto_engine_large = OntoMapEngine(
    method='mt-sap-bert',
    category='disease',
    topk=5,
    query=query_list,
    corpus=large_corpus_list,
    cura_map=cura_map,
    s2_strategy="st",
    s3_strategy=None,
    **other_params,
)
st_sapbert_disease_top5_result = onto_engine_large.run()

31/10/2025 02:04:37 AM - INFO - OntoMapEngine: Initialized OntoMap Engine
31/10/2025 02:04:37 AM - INFO - OntoMapEngine: Stage 1: Exact matching
31/10/2025 02:04:37 AM - INFO - OntoMapEngine: Stage 2: ST
31/10/2025 02:04:37 AM - INFO - OntoMapEngine: Stage 3: Disabled
31/10/2025 02:04:37 AM - INFO - OntoMapEngine: Starting Ontology Mapping
31/10/2025 02:04:37 AM - INFO - OntoMapEngine: Stage 1: Exact Matching
31/10/2025 02:04:37 AM - INFO - OntoMapEngine: Exact matches: 342
31/10/2025 02:04:37 AM - INFO - OntoMapEngine: Remaining for Stage 2: 1213
31/10/2025 02:04:37 AM - INFO - OntoMapEngine: Stage 2: ST Matching
31/10/2025 02:04:37 AM - INFO - OntoMapEngine: Replacing shortNames using rule-based name mapping
31/10/2025 02:04:37 AM - INFO - OntoMapEngine: Replaced: ACC → Adrenocortical Carcinoma
31/10/2025 02:04:37 AM - INFO - OntoMapEngine: Replaced: ACYC → Adenoid Cystic Carcinoma
31/10/2025 02:04:37 AM - INFO - OntoMapEngine: Replaced: AML → Acute Myeloid Leukemia
31/10/2025 02:04:

No sentence-transformers model found with name model_cache/mt-sap-bert. Creating a new one with mean pooling.
No sentence-transformers model found with name cambridgeltl/SapBERT-from-PubMedBERT-fulltext-mean-token. Creating a new one with mean pooling.
  return forward_call(*args, **kwargs)


31/10/2025 02:04:43 AM - INFO - OntoMapEngine: Stage 2 completed: 1213 queries
31/10/2025 02:04:43 AM - INFO - OntoMapEngine: Stage 3: Disabled
31/10/2025 02:04:43 AM - INFO - OntoMapEngine: FINAL SUMMARY
31/10/2025 02:04:43 AM - INFO - OntoMapEngine: Stage 1 (Exact): 342 queries
31/10/2025 02:04:43 AM - INFO - OntoMapEngine: Stage 2 (ST): 1213 queries


In [7]:
# Calculate Top1, Top3, and Top5 accuracy for the ST (mt-sap-bert) results
# and print the resulting table.

st_sapbert_accuracy_df = calc.calc_accuracy(st_sapbert_disease_top5_result)
print(st_sapbert_accuracy_df)

  Accuracy Level   Accuracy
0    Top 1 Match  75.562701
1  Top 3 Matches  84.694534
2  Top 5 Matches  87.781350


In [8]:
# Save the results to a CSV file for further analysis or reporting. Optional.
# DataFrame.to_csv does not create missing folders, so make sure the nested
# output directory exists first (it may be absent on a fresh checkout).
from pathlib import Path

st_sapbert_output_path = Path(
    "data/outputs/2025/large_corpus/1024/st_sapbert_disease_top5_result.csv")
st_sapbert_output_path.parent.mkdir(parents=True, exist_ok=True)

st_sapbert_disease_top5_result.to_csv(st_sapbert_output_path, index=False)

In [8]:
# RAG strategy in Stage 3: Stage 2 (ST) results whose top1_score is below
# s3_threshold are passed on to RAG. RAG needs corpus_df for concept retrieval,
# so the full corpus DataFrame is supplied in addition to the label list.
other_params = {"test_or_prod": "test"}
onto_engine_large = OntoMapEngine(
    method='pubmed-bert',
    category='disease',
    topk=5,
    query=query_list,
    corpus=large_corpus_list,
    cura_map=cura_map,
    corpus_df=large_corpus,
    s2_strategy="st",
    s3_strategy="rag",
    s3_threshold=0.9,
    **other_params,
)
st_rag_pubmedbert_disease_top5_result = onto_engine_large.run()

31/10/2025 02:05:03 AM - INFO - OntoMapEngine: Initialized OntoMap Engine
31/10/2025 02:05:03 AM - INFO - OntoMapEngine: Stage 1: Exact matching
31/10/2025 02:05:03 AM - INFO - OntoMapEngine: Stage 2: ST
31/10/2025 02:05:03 AM - INFO - OntoMapEngine: Stage 3: RAG (threshold=0.9)
31/10/2025 02:05:03 AM - INFO - OntoMapEngine: Starting Ontology Mapping
31/10/2025 02:05:03 AM - INFO - OntoMapEngine: Stage 1: Exact Matching
31/10/2025 02:05:03 AM - INFO - OntoMapEngine: Exact matches: 341
31/10/2025 02:05:03 AM - INFO - OntoMapEngine: Remaining for Stage 2: 1214
31/10/2025 02:05:03 AM - INFO - OntoMapEngine: Stage 2: ST Matching
31/10/2025 02:05:03 AM - INFO - OntoMapEngine: Replacing shortNames using rule-based name mapping
31/10/2025 02:05:03 AM - INFO - OntoMapEngine: Replaced: ACC → Adrenocortical Carcinoma
31/10/2025 02:05:03 AM - INFO - OntoMapEngine: Replaced: ACYC → Adenoid Cystic Carcinoma
31/10/2025 02:05:03 AM - INFO - OntoMapEngine: Replaced: AML → Acute Myeloid Leukemia
31/10/

  return forward_call(*args, **kwargs)


31/10/2025 02:05:08 AM - INFO - OntoMapEngine: Stage 2 completed: 1214 queries
31/10/2025 02:05:08 AM - INFO - OntoMapEngine: Stage 3: RAG Matching
31/10/2025 02:05:08 AM - INFO - OntoMapEngine: S2 result columns: ['original_value', 'updated_value', 'curated_ontology', 'match_level', 'top1_match', 'top1_score', 'top2_match', 'top2_score', 'top3_match', 'top3_score', 'top4_match', 'top4_score', 'top5_match', 'top5_score', 'stage']
31/10/2025 02:05:08 AM - INFO - OntoMapEngine: S2 result top1_score dtype: object
31/10/2025 02:05:08 AM - INFO - OntoMapEngine: S2 result top1_score unique values (first 10): ['0.9152' '0.9438' '0.8933' '0.9847' '0.9639' '0.9459' '0.9902' '0.9771'
 '0.9838' '0.9519']
31/10/2025 02:05:08 AM - INFO - OntoMapEngine: Queries with top1_score < 0.9: 65
31/10/2025 02:05:08 AM - INFO - OntoMapRAG: Initialized OntoMapRAG module
31/10/2025 02:05:08 AM - INFO - OntoMapRAG: Generating results table


Processing queries:   0%|          | 0/65 [00:00<?, ?it/s]

31/10/2025 02:05:09 AM - INFO - FAISSSQLiteSearch: All corpus terms already processed.
31/10/2025 02:05:09 AM - INFO - OntoMapRAG: True - Vector store initialized for method=pubmed-bert, category=disease, om_strategy=rag


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)
Processing queries:   2%|▏         | 1/65 [00:00<00:21,  3.04it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:   8%|▊         | 5/65 [00:00<00:04, 13.87it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  14%|█▍        | 9/65 [00:00<00:02, 21.12it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  20%|██        | 13/65 [00:00<00:01, 26.34it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  26%|██▌       | 17/65 [00:00<00:01, 29.46it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  32%|███▏      | 21/65 [00:00<00:01, 31.34it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  38%|███▊      | 25/65 [00:00<00:01, 33.20it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  45%|████▍     | 29/65 [00:01<00:01, 33.34it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  51%|█████     | 33/65 [00:01<00:00, 33.98it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  57%|█████▋    | 37/65 [00:01<00:00, 31.58it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  63%|██████▎   | 41/65 [00:01<00:00, 31.78it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  69%|██████▉   | 45/65 [00:01<00:00, 32.83it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  75%|███████▌  | 49/65 [00:01<00:00, 33.95it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  82%|████████▏ | 53/65 [00:01<00:00, 35.28it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  88%|████████▊ | 57/65 [00:01<00:00, 35.09it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries:  94%|█████████▍| 61/65 [00:02<00:00, 34.82it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

                                                                   

31/10/2025 02:05:10 AM - INFO - OntoMapRAG: Results Generated
31/10/2025 02:05:10 AM - INFO - OntoMapEngine: Stage 3 completed: 65 queries
31/10/2025 02:05:10 AM - INFO - OntoMapEngine: FINAL SUMMARY
31/10/2025 02:05:10 AM - INFO - OntoMapEngine: Stage 1 (Exact): 341 queries
31/10/2025 02:05:10 AM - INFO - OntoMapEngine: Stage 2 (ST): 1149 queries
31/10/2025 02:05:10 AM - INFO - OntoMapEngine: Stage 3 (RAG): 65 queries




In [9]:
# Calculate Top1, Top3, and Top5 accuracy for the ST + RAG (pubmed-bert) results
# and print the resulting table.
st_rag_pubmedbert_disease_top5_eval = calc.calc_accuracy(
    st_rag_pubmedbert_disease_top5_result)
print(st_rag_pubmedbert_disease_top5_eval)

  Accuracy Level   Accuracy
0    Top 1 Match  72.990354
1  Top 3 Matches  81.221865
2  Top 5 Matches  84.630225


In [10]:
# Save the ST + RAG results to a CSV file. Optional.
# DataFrame.to_csv does not create missing folders, so make sure the nested
# output directory exists first (it may be absent on a fresh checkout).
from pathlib import Path

st_rag_pubmedbert_output_path = Path(
    "data/outputs/2025/large_corpus/1024/st_rag_pubmedbert_disease_top5_result.csv")
st_rag_pubmedbert_output_path.parent.mkdir(parents=True, exist_ok=True)

st_rag_pubmedbert_disease_top5_result.to_csv(st_rag_pubmedbert_output_path,
                                             index=False)

In [12]:
# rag_bie Strategy: needs corpus_df for concept retrieval and query_df for query enrichment.
# Note: rag_bie is a query-enriched variant of RAG, so the query file with expanded fields is used.
# Caution: this cell rebinds query_list, large_corpus_list and cura_map, which the
# ST / RAG cells above also read. Re-run the data-loading cell before re-running those.
query_df = pd.read_csv("data/corpus/cbio_disease/query_with_selected_fields_for_bie.csv")
large_corpus = pd.read_csv(
    'data/corpus/cbio_disease/disease_corpus_updated.csv')

query_list = query_df['original_cancer_type_value'].tolist() # TODO: use a common schema for all strategies.
# Use the same column fallback as the initial corpus load of this file:
# prefer 'official_label', otherwise fall back to 'label'.
large_corpus_list = (
    large_corpus["official_label"].tolist()
    if "official_label" in large_corpus.columns
    else large_corpus["label"].tolist()
)

# Ground-truth lookup: original cancer type value -> official label from the query file.
cura_map = dict(zip(query_df['original_cancer_type_value'], query_df['official_label']))

# Run the rag_bie strategy: Stage 2 uses ST; results with top1_score below
# s3_threshold go on to Stage 3 (RAG_BIE).
other_params = {"test_or_prod": "test"}
onto_engine_large = OntoMapEngine(
    method='pubmed-bert',
    category='disease',
    topk=20,
    query=query_list,
    corpus=large_corpus_list,
    cura_map=cura_map,
    s2_strategy='st',
    s3_strategy='rag_bie',
    s3_threshold=0.95,
    corpus_df=large_corpus,
    query_df=query_df,
    **other_params,
)
pubmedbert_rag_bie_result = onto_engine_large.run()

31/10/2025 02:09:14 AM - INFO - OntoMapEngine: Initialized OntoMap Engine
31/10/2025 02:09:14 AM - INFO - OntoMapEngine: Stage 1: Exact matching
31/10/2025 02:09:14 AM - INFO - OntoMapEngine: Stage 2: ST
31/10/2025 02:09:14 AM - INFO - OntoMapEngine: Stage 3: RAG_BIE (threshold=0.95)
31/10/2025 02:09:14 AM - INFO - OntoMapEngine: Starting Ontology Mapping
31/10/2025 02:09:14 AM - INFO - OntoMapEngine: Stage 1: Exact Matching
31/10/2025 02:09:14 AM - INFO - OntoMapEngine: Exact matches: 66
31/10/2025 02:09:14 AM - INFO - OntoMapEngine: Remaining for Stage 2: 92
31/10/2025 02:09:14 AM - INFO - OntoMapEngine: Stage 2: ST Matching
31/10/2025 02:09:14 AM - INFO - OntoMapEngine: Replacing shortNames using rule-based name mapping
31/10/2025 02:09:14 AM - INFO - OntoMapST: Initialized OntoMap Sentence Transformer module


  return forward_call(*args, **kwargs)


31/10/2025 02:09:17 AM - INFO - OntoMapEngine: Stage 2 completed: 92 queries
31/10/2025 02:09:17 AM - INFO - OntoMapEngine: Stage 3: RAG_BIE Matching
31/10/2025 02:09:17 AM - INFO - OntoMapEngine: S2 result columns: ['original_value', 'updated_value', 'curated_ontology', 'match_level', 'top1_match', 'top1_score', 'top2_match', 'top2_score', 'top3_match', 'top3_score', 'top4_match', 'top4_score', 'top5_match', 'top5_score', 'top6_match', 'top6_score', 'top7_match', 'top7_score', 'top8_match', 'top8_score', 'top9_match', 'top9_score', 'top10_match', 'top10_score', 'top11_match', 'top11_score', 'top12_match', 'top12_score', 'top13_match', 'top13_score', 'top14_match', 'top14_score', 'top15_match', 'top15_score', 'top16_match', 'top16_score', 'top17_match', 'top17_score', 'top18_match', 'top18_score', 'top19_match', 'top19_score', 'top20_match', 'top20_score', 'stage']
31/10/2025 02:09:17 AM - INFO - OntoMapEngine: S2 result top1_score dtype: object
31/10/2025 02:09:17 AM - INFO - OntoMapE

Adding context to query_df: 100%|██████████| 214/214 [01:03<00:00,  3.38it/s]
Processing queries (Bi-Encoder):   0%|          | 0/214 [00:00<?, ?it/s]

31/10/2025 02:10:20 AM - INFO - FAISSSQLiteSearch: All corpus terms already processed.
31/10/2025 02:10:20 AM - INFO - OntoMapBIE: True - Vector store initialized for method=pubmed-bert, category=disease, om_strategy=rag_bie


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)
Processing queries (Bi-Encoder):   0%|          | 1/214 [00:00<00:42,  5.04it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):   1%|▏         | 3/214 [00:00<00:24,  8.60it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):   2%|▏         | 5/214 [00:00<00:17, 11.89it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):   4%|▍         | 9/214 [00:00<00:10, 20.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):   7%|▋         | 14/214 [00:00<00:07, 26.49it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):   8%|▊         | 18/214 [00:00<00:06, 28.29it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  10%|█         | 22/214 [00:00<00:06, 30.06it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  12%|█▏        | 26/214 [00:01<00:06, 30.38it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  14%|█▍        | 30/214 [00:01<00:05, 32.01it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  16%|█▌        | 34/214 [00:01<00:05, 32.76it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  18%|█▊        | 38/214 [00:01<00:05, 33.12it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  20%|██        | 43/214 [00:01<00:04, 35.55it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  22%|██▏       | 47/214 [00:01<00:04, 36.16it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  24%|██▍       | 51/214 [00:01<00:04, 36.38it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  26%|██▌       | 55/214 [00:01<00:04, 33.85it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  28%|██▊       | 60/214 [00:02<00:04, 35.94it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  30%|██▉       | 64/214 [00:02<00:04, 36.95it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  32%|███▏      | 68/214 [00:02<00:04, 35.89it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  34%|███▎      | 72/214 [00:02<00:03, 35.92it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  36%|███▌      | 76/214 [00:02<00:03, 36.85it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  38%|███▊      | 81/214 [00:02<00:03, 37.41it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  40%|███▉      | 85/214 [00:02<00:03, 36.06it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  42%|████▏     | 89/214 [00:02<00:03, 35.71it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  43%|████▎     | 93/214 [00:02<00:03, 35.92it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  45%|████▌     | 97/214 [00:03<00:03, 35.29it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  47%|████▋     | 101/214 [00:03<00:03, 36.08it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  49%|████▉     | 105/214 [00:03<00:02, 36.34it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  51%|█████▏    | 110/214 [00:03<00:02, 37.21it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  53%|█████▎    | 114/214 [00:03<00:02, 36.90it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  55%|█████▌    | 118/214 [00:03<00:02, 36.52it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  57%|█████▋    | 122/214 [00:03<00:02, 35.88it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  59%|█████▉    | 126/214 [00:03<00:02, 36.83it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  61%|██████    | 130/214 [00:03<00:02, 35.79it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  63%|██████▎   | 134/214 [00:04<00:02, 34.99it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  64%|██████▍   | 138/214 [00:04<00:02, 34.71it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  67%|██████▋   | 143/214 [00:04<00:01, 36.13it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  69%|██████▊   | 147/214 [00:04<00:01, 36.98it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  71%|███████   | 151/214 [00:04<00:01, 37.02it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  72%|███████▏  | 155/214 [00:04<00:01, 36.22it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  74%|███████▍  | 159/214 [00:04<00:01, 33.67it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  76%|███████▌  | 163/214 [00:04<00:01, 33.63it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  78%|███████▊  | 167/214 [00:05<00:01, 34.11it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  80%|███████▉  | 171/214 [00:05<00:01, 35.10it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  82%|████████▏ | 175/214 [00:05<00:01, 35.78it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  84%|████████▎ | 179/214 [00:05<00:00, 35.13it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  86%|████████▌ | 183/214 [00:05<00:00, 34.04it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  87%|████████▋ | 187/214 [00:05<00:00, 35.46it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  89%|████████▉ | 191/214 [00:05<00:00, 35.38it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  91%|█████████ | 195/214 [00:05<00:00, 35.00it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  93%|█████████▎| 199/214 [00:05<00:00, 35.39it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  95%|█████████▍| 203/214 [00:06<00:00, 33.08it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  97%|█████████▋| 207/214 [00:06<00:00, 33.44it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Processing queries (Bi-Encoder):  99%|█████████▊| 211/214 [00:06<00:00, 33.11it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

                                                                                  

31/10/2025 02:10:26 AM - INFO - OntoMapBIE: Bi-Encoder Results Generated
31/10/2025 02:10:26 AM - INFO - OntoMapEngine: Stage 3 completed: 20 queries
31/10/2025 02:10:26 AM - INFO - OntoMapEngine: FINAL SUMMARY
31/10/2025 02:10:26 AM - INFO - OntoMapEngine: Stage 1 (Exact): 66 queries
31/10/2025 02:10:26 AM - INFO - OntoMapEngine: Stage 2 (ST): 78 queries
31/10/2025 02:10:26 AM - INFO - OntoMapEngine: Stage 3 (RAG_BIE): 20 queries




In [13]:
# Calculate Top1, Top3, and Top5 accuracy for the ST + RAG_BIE (pubmed-bert) results
# and print the resulting table.
pubmedbert_rag_bie_accuracy_df = calc.calc_accuracy(pubmedbert_rag_bie_result)
print(pubmedbert_rag_bie_accuracy_df)

  Accuracy Level   Accuracy
0    Top 1 Match  76.219512
1  Top 3 Matches  77.439024
2  Top 5 Matches  82.926829


Workflow Code

In [12]:
from graphviz import Digraph


def _filled(shape, fillcolor):
    """Return node attributes for a colour-filled node of the given shape."""
    return {'shape': shape, 'style': 'filled', 'fillcolor': fillcolor}


# Attribute presets shared by several nodes.
_BOX = {'shape': 'box'}
_DECISION = {'shape': 'diamond'}
_RESULT_BOX = _filled('box', 'lightgreen')
_CHOICE_BOX = _filled('box', 'orange')
_SEARCH = _filled('ellipse', 'orange')

# One entry per pipeline stage: (cluster name, cluster title, background, nodes).
# Each node is (id, label, attributes). Declaration order is kept on purpose:
# it is the order in which statements are appended to the generated DOT source.
_STAGES = [
    ('cluster_s1', 'Stage 1: Exact Matching', 'lightblue', [
        ('S1_EXACT', 'Exact Match?', _DECISION),
        ('S1_MATCHED', 'Exact Matched Terms\n(stage=1)', _RESULT_BOX),
    ]),
    ('cluster_s2', 'Stage 2: LM/ST (Transformer-based)', 'lightyellow', [
        ('S2_ABBR', 'Replace Abbreviations\n(via abbreviation dict)', _BOX),
        ('S2_STRATEGY', 'Choose s2_strategy:\nLM or ST', _CHOICE_BOX),
        ('S2_FAISS', 'Check FAISS Index &\nSQLite Table', _BOX),
        ('S2_BUILD', 'Encode Corpus Terms\n→ Store in FAISS & SQLite', _BOX),
        ('S2_CHECK', 'Check Completeness\n→ Append Missing if Needed', _BOX),
        ('S2_ENCODE', 'Encode Query Terms', _filled('box', 'lightyellow')),
        ('S2_SEARCH', 'FAISS Search\n(Get top-k matches + scores)', _SEARCH),
        ('S2_RESULT', 'Stage 2 Results\n(stage=2, with top1_score)', _RESULT_BOX),
    ]),
    ('cluster_s3', 'Stage 3: RAG/RAG_BIE (Optional, for low-confidence queries)', 'lightcoral', [
        ('S3_CHECK', 'top1_score < s3_threshold?', _DECISION),
        ('S3_SKIP', 'Skip Stage 3', {'shape': 'box', 'style': 'dashed'}),
        ('S3_ABBR', 'Replace Abbreviations\nfor low-confidence queries', _BOX),
        ('S3_STRATEGY', 'Choose s3_strategy:\nRAG or RAG_BIE', _CHOICE_BOX),
        ('S3_FAISS', 'Check FAISS Index &\nSQLite Table', _BOX),
        ('S3_BUILD', 'Encode Corpus Contexts\n→ Store in FAISS & SQLite', _BOX),
        ('S3_CHECK_DB', 'Check Completeness\n→ Append Missing if Needed', _BOX),
        ('S3_ENCODE_RAG', 'RAG:\nEncode Query Terms', _filled('box', 'lightcoral')),
        ('S3_ENCODE_RAGBIE', 'RAG_BIE:\nEnrich + Encode Query', _filled('box', 'lightcoral')),
        ('S3_SEARCH', 'FAISS Search\n(Get top-k matches + scores)', _SEARCH),
        ('S3_RESULT', 'Stage 3 Results\n(stage=3, overrides low-confidence S2 results)', _RESULT_BOX),
    ]),
]

# Visible edges as (tail, head, label); a label of None draws an unlabeled edge.
_FLOW = [
    # Input to Stage 1
    ('A', 'S1_EXACT', None),
    # Stage 1 decision
    ('S1_EXACT', 'S1_MATCHED', 'Yes'),
    ('S1_EXACT', 'S2_ABBR', 'No\n(unmatched queries)'),
    # Stage 2 pipeline
    ('S2_ABBR', 'S2_STRATEGY', None),
    ('S2_STRATEGY', 'S2_FAISS', None),
    ('S2_FAISS', 'S2_BUILD', 'Not Exist'),
    ('S2_FAISS', 'S2_CHECK', 'Exist'),
    ('S2_BUILD', 'S2_ENCODE', None),
    ('S2_CHECK', 'S2_ENCODE', None),
    ('S2_ENCODE', 'S2_SEARCH', None),
    ('S2_SEARCH', 'S2_RESULT', None),
    # Hand-off from Stage 2 to the Stage 3 decision
    ('S2_RESULT', 'S3_CHECK', None),
    ('S3_CHECK', 'S3_SKIP', 'No / s3_strategy=None'),
    ('S3_CHECK', 'S3_ABBR', 'Yes'),
    # Stage 3 pipeline
    ('S3_ABBR', 'S3_STRATEGY', None),
    ('S3_STRATEGY', 'S3_FAISS', None),
    ('S3_FAISS', 'S3_BUILD', 'Not Exist'),
    ('S3_FAISS', 'S3_CHECK_DB', 'Exist'),
    ('S3_BUILD', 'S3_ENCODE_RAG', 'RAG'),
    ('S3_BUILD', 'S3_ENCODE_RAGBIE', 'RAG_BIE'),
    ('S3_CHECK_DB', 'S3_ENCODE_RAG', 'RAG'),
    ('S3_CHECK_DB', 'S3_ENCODE_RAGBIE', 'RAG_BIE'),
    ('S3_ENCODE_RAG', 'S3_SEARCH', None),
    ('S3_ENCODE_RAGBIE', 'S3_SEARCH', None),
    ('S3_SEARCH', 'S3_RESULT', None),
    # Every stage feeds the merge step
    ('S1_MATCHED', 'MERGE', None),
    ('S2_RESULT', 'MERGE', '(if S3 skipped)'),
    ('S3_SKIP', 'MERGE', None),
    ('S3_RESULT', 'MERGE', None),
    # Merge to final output
    ('MERGE', 'OUTPUT', None),
]

# ---- Assemble the graph -----------------------------------------------------
dot = Digraph(comment='3-Stage Ontology Mapping Workflow')
dot.attr(rankdir='TB', fontsize='10', fontname='Helvetica')

# Entry node listing the arguments shown at the top of the chart.
dot.node(
    'A',
    'Input Arguments\n(query, corpus, s2_strategy, s3_strategy, s3_threshold)',
    **_filled('box', 'lightgray'),
)

# One filled cluster per stage, populated from the table above.
for cluster_name, cluster_title, cluster_fill, stage_nodes in _STAGES:
    with dot.subgraph(name=cluster_name) as stage_graph:
        stage_graph.attr(label=cluster_title, style='filled',
                         fillcolor=cluster_fill, fontsize='11')
        for node_id, node_label, node_attrs in stage_nodes:
            stage_graph.node(node_id, node_label, **node_attrs)

# Nodes that sit outside the stage clusters: merge step and final output.
dot.node(
    'MERGE',
    'Merge All Stages:\nStage 1 + Stage 2 (filtered) + Stage 3',
    **_RESULT_BOX,
)
dot.node(
    'OUTPUT',
    'Final Output DataFrame\n(with stage column: 1, 2, or 3)',
    **_filled('box', 'green'),
)

# Wire up the flow in declaration order.
for tail, head, edge_label in _FLOW:
    dot.edge(tail, head, edge_label)

# Explanatory note; the invisible edge only ties it to OUTPUT in the graph.
dot.node(
    'NOTE',
    'Note:\n'
    '• Stage 1: Exact matching (always runs)\n'
    '• Stage 2: LM/ST transformer matching (always runs)\n'
    '• Stage 3: RAG/RAG_BIE context-based matching\n'
    '  (only for queries with top1_score < s3_threshold)\n'
    '• RAG_BIE enriches queries before encoding',
    shape='note',
    style='dashed',
    fontsize='9',
    fontcolor='slategray',
)
dot.edge('OUTPUT', 'NOTE', style='invis')

# Render to PNG; cleanup=True is passed so only the rendered image is kept.
dot.render('ontology_mapping_3stage_workflow', format='png', cleanup=True)
print("Flowchart saved as 'ontology_mapping_3stage_workflow.png'")

Flowchart saved as 'ontology_mapping_3stage_workflow.png'
