In [1]:
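# First, change the working directory to the project root, so that the relative
# paths used below (data/corpus/...) and the `src` package imports resolve.
# The path is machine-specific; replace it with your own checkout.
# Example: %cd /home/your_username/MetaHarmonizer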

%cd /home/lcc/projects/MetaHarmonizer

/home/lcc/projects/MetaHarmonizer


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [None]:
# Required files:
# data/corpus/oncotree_code_to_name.csv

In [2]:
# This is required to run asyncio code in Jupyter notebooks.
# Jupyter already runs its own event loop, so calling asyncio.run() directly would raise an error.
# nest_asyncio.apply() patches the loop to allow nested usage.
# NOTE(review): presumably this must run before any cell that drives async code — confirm.

import nest_asyncio

nest_asyncio.apply()

In [None]:
# Core imports: pandas, the ontology mapper implementations, and the mapping engine.

import pandas as pd
from importlib import reload

# One module per ontology mapping strategy (ST, LM, RAG, Bi-Encoder)
from src.models import ontology_mapper_st as om_st
from src.models import ontology_mapper_lm as om_lm
from src.models import ontology_mapper_rag as om_rag
from src.models import ontology_mapper_bi_encoder as om_bi

# The engine that handles pipeline logic and integrates the mappers
from src.Engine import ontology_mapping_engine as ome

# Development convenience (useful in Jupyter): re-import the modules so code edits
# are picked up without restarting the kernel. The mapper modules are reloaded
# first, in the same order they were imported.
for _mapper_module in (om_st, om_lm, om_rag, om_bi):
    reload(_mapper_module)

# The engine is reloaded last; as the final expression, its module repr is the cell output.
reload(ome)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/lcc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/lcc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/lcc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/lcc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/lcc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<module 'src.Engine.ontology_mapping_engine' from '/home/lcc/projects/MetaHarmonizer/src/Engine/ontology_mapping_engine.py'>

In [None]:
# Import the CalcStats class for calculating Top1, Top3, and Top5 accuracy

from evaluation.calc_stats import CalcStats

# Single shared instance: later cells call calc.calc_accuracy(...) on each result DataFrame.
calc = CalcStats()

In [None]:
# Optional utility: reset the FAISS + SQLite vector store for one configuration.
# Handy after testing or before re-running an experiment, so no stale data is reused.
# Not required for standard inference or training runs.

from src.utils.cleanup_vector_store import cleanup_vector_store

# Positional order passed to cleanup_vector_store: (strategy, model_name, entity_type)
cleanup_target = ("st", "mt-sap-bert", "disease")
cleanup_vector_store(*cleanup_target)

[Success] Table 'st_mt_sap_bert_disease' dropped from src/KnowledgeDb/vector_db.sqlite
[Success] Index file 'src/KnowledgeDb/faiss_indexes/st_mt-sap-bert_disease.index' deleted.


DF

In [5]:
# Query file: raw values together with their curated ontology labels.
df = pd.read_csv("data/corpus/cbio_disease/disease_query_updated.csv")
# Large corpus: the full list of official ontology labels.
large_corpus = pd.read_csv('data/corpus/cbio_disease/disease_corpus_updated.csv')

original_values = df['original_value']
curated_labels = df['curated_ontology']

query_list = original_values.tolist()
# The small corpus consists of the curated ontology values found in the query file.
small_corpus_list = curated_labels.tolist()
large_corpus_list = large_corpus['official_label'].tolist()

# original value -> curated ontology value
cura_map = dict(zip(original_values, curated_labels))

In [None]:
# ST strategy run against the large corpus.
# Pooling: the ST strategy uses the default pooling method; the LM strategy uses CLS token pooling.
# OntoMapEngine arguments used here:
# - method: name of the model to use (here 'mt-sap-bert')
# - category: entity category (here 'disease')
# - topk: retrieve the top k matches
# - query: list of original values to map
# - corpus: list of ontology values to match against (here the large corpus' official labels)
# - cura_map: dictionary mapping original values to curated ontology values
# - om_strategy: which mapping strategy to run (here 'st')
# Returns: DF with original values, curated ontology values, match levels, stage,
#          and top k matches with scores

other_params = {"test_or_prod": "test"}
st_engine_config = dict(
    method='mt-sap-bert',
    category='disease',
    topk=20,
    query=query_list,
    corpus=large_corpus_list,
    cura_map=cura_map,
    om_strategy='st',
)
onto_engine_large = ome.OntoMapEngine(**st_engine_config, **other_params)
st_sapbert_disease_top20_result = onto_engine_large.run()

09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Initialized OntoMap Engine module
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Running Ontology Mapping
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Separating exact and non-exact matches
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replacing shortNames using rule-based name mapping
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replaced: ACC → Adrenocortical Carcinoma
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replaced: ACYC → Adenoid Cystic Carcinoma
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replaced: AML → Acute Myeloid Leukemia
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replaced: ANSC → Anal Squamous Cell Carcinoma
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replaced: APAD → Appendiceal Adenocarcinoma
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replaced: ASPS → Alveolar Soft Part Sarcoma
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replaced: ATRT → Atypical Teratoid/Rhabdoid Tumor
09/07/2025 03:37:15 PM - INF

No sentence-transformers model found with name model_cache/mt-sap-bert. Creating a new one with mean pooling.
Batches: 100%|██████████| 16/16 [00:00<00:00, 22.10it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 62.87it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 65.62it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 73.16it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 75.38it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 71.31it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 70.76it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 72.96it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 75.97it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 73.88it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 70.75it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 70.94it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 64.52it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 70.14it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 71.41it/s]
Batches: 100%|██████████| 16/16 [00

In [None]:
# Calculate Top1, Top3, and Top5 accuracy for the ST (mt-sap-bert) results.
# `calc` is the CalcStats instance created in an earlier cell.

st_sapbert_accuracy_df = calc.calc_accuracy(st_sapbert_disease_top20_result)
# Prints one row per accuracy level ("Top 1 Match", "Top 3 Matches", "Top 5 Matches").
print(st_sapbert_accuracy_df)

  Accuracy Level   Accuracy
0    Top 1 Match  75.241779
1  Top 3 Matches  84.526112
2  Top 5 Matches  87.685364


In [None]:
# Optional: persist the ST results as CSV for further analysis or reporting.
# The path is relative, so the file is written to the current working directory.

st_result_csv_path = "st_sapbert_disease_top20_result.csv"
st_sapbert_disease_top20_result.to_csv(st_result_csv_path, index=False)

In [None]:
# RAG strategy example: in addition to the usual arguments, the engine needs
# corpus_df (the corpus DataFrame) for concept retrieval.
other_params = {"test_or_prod": "test"}
rag_engine_config = dict(
    method='pubmed-bert',
    category='disease',
    topk=5,
    query=query_list,
    corpus=large_corpus_list,
    cura_map=cura_map,
    om_strategy='rag',
    corpus_df=large_corpus,
)
onto_engine_large = ome.OntoMapEngine(**rag_engine_config, **other_params)
pubmed_bert_result = onto_engine_large.run()

In [7]:
# rag_bie strategy example: needs corpus_df for concept retrieval and query_df for
# query enrichment.
#
# Note: rag_bie is a query-enriched variant of RAG, so the query must come from the
# file with expanded fields. This cell therefore rebinds query_list,
# large_corpus_list and cura_map to values built from that file.

# TODO: use a common schema for all strategies.
bie_query_column = 'original_cancer_type_value'

query_df = pd.read_csv("data/corpus/cbio_disease/query_with_selected_fields_for_bie.csv")
large_corpus = pd.read_csv('data/corpus/cbio_disease/disease_corpus_updated.csv')

query_list = query_df[bie_query_column].tolist()
large_corpus_list = large_corpus['official_label'].tolist()

# original value -> official label, taken from the enriched query file
cura_map = dict(zip(query_df[bie_query_column], query_df['official_label']))

# Run the rag_bie strategy.
other_params = {"test_or_prod": "test"}
rag_bie_engine_config = dict(
    method='pubmed-bert',
    category='disease',
    topk=20,
    query=query_list,
    corpus=large_corpus_list,
    cura_map=cura_map,
    om_strategy='rag_bie',
    corpus_df=large_corpus,
    query_df=query_df,
)
onto_engine_large = ome.OntoMapEngine(**rag_bie_engine_config, **other_params)
pubmedbert_rag_bie_result = onto_engine_large.run()

10/07/2025 12:13:22 PM - INFO - OntoMapEngine: Initialized OntoMap Engine module
10/07/2025 12:13:22 PM - INFO - OntoMapEngine: Running Ontology Mapping
10/07/2025 12:13:22 PM - INFO - OntoMapEngine: Separating exact and non-exact matches
10/07/2025 12:13:22 PM - INFO - OntoMapEngine: Replacing shortNames using rule-based name mapping
10/07/2025 12:13:22 PM - INFO - OntoMapBIE: Initialized Bi-Encoder (query with context) module


Adding context to query_df: 100%|██████████| 214/214 [01:12<00:00,  2.93it/s]
Processing queries (Bi-Encoder):   0%|          | 0/214 [00:00<?, ?it/s]

10/07/2025 12:14:38 PM - INFO - FAISSSQLiteSearch: 13943 new terms to add to the index.
10/07/2025 12:14:38 PM - INFO - FAISSSQLiteSearch: Using provided DataFrame to update term-code pairs.
10/07/2025 12:14:38 PM - INFO - FAISSSQLiteSearch: Using provided DataFrame to fetch term-code pairs.
10/07/2025 12:14:38 PM - INFO - FAISSSQLiteSearch: Retrieved codes for 13924 terms
10/07/2025 12:14:38 PM - INFO - FAISSSQLiteSearch: Fetching concept data for 13927 unique codes
10/07/2025 12:14:38 PM - INFO - NCIDb: Fetching concept data for 13927 codes in batches of 50
10/07/2025 12:14:43 PM - INFO - NCIDb: Processed batch 2 of 279
10/07/2025 12:14:43 PM - INFO - NCIDb: Processed batch 1 of 279
10/07/2025 12:14:49 PM - INFO - NCIDb: Processed batch 4 of 279
10/07/2025 12:14:52 PM - INFO - NCIDb: Processed batch 5 of 279
10/07/2025 12:14:52 PM - INFO - NCIDb: Processed batch 3 of 279
10/07/2025 12:14:54 PM - INFO - NCIDb: Processed batch 6 of 279
10/07/2025 12:14:57 PM - INFO - NCIDb: Processed b

Building context and records: 100%|██████████| 13924/13924 [00:01<00:00, 12394.49it/s]

10/07/2025 12:27:32 PM - INFO - FAISSSQLiteSearch: Inserting 13927 records into SQLite





10/07/2025 12:27:33 PM - INFO - FAISSSQLiteSearch: Starting vector embedding and FAISS index insertion



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Batches: 100%|██████████| 32/32 [00:03<00:00,  8.76it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Batches: 100%|██████████| 32/32 [00:03<00:00,  9.53it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Batches: 100%|██████████| 32/32 [00:03<00:00,  9.35it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Batches: 100%|██████████| 32/32 [00:03<00:00, 10.12it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Batches: 100%|██████████| 32/32 [00:02<00:00, 11.14it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Batches: 100%|██████████| 32/32 [00:02<00:00, 12.20it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Batches: 10

10/07/2025 12:29:03 PM - INFO - FAISSSQLiteSearch: Finished fetching and storing all terms.
10/07/2025 12:29:03 PM - INFO - OntoMapBIE: True - Vector store initialized for method=pubmed-bert, category=disease, om_strategy=rag_bie



Batches: 100%|██████████| 1/1 [00:00<00:00, 46.77it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 99.05it/s]4 [14:27<51:20:56, 867.87s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 98.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 97.25it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 99.92it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 91.90it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 97.69it/s]4 [14:27<6:09:32, 106.60s/it] 
Batches: 100%|██████████| 1/1 [00:00<00:00, 93.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 97.25it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 90.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 99.84it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 98.22it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 75.34it/s]14 [14:28<2:23:22, 42.59s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 92.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 96.01it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 99.94it/s]
Batches: 100%|██████████| 1/1 

10/07/2025 12:29:08 PM - INFO - OntoMapBIE: Bi-Encoder Results Generated


In [8]:
# Calculate accuracy for the rag_bie (pubmed-bert) results with the shared CalcStats instance.
pubmedbert_rag_bie_accuracy_df = calc.calc_accuracy(pubmedbert_rag_bie_result)
# Prints one row per accuracy level ("Top 1 Match", "Top 3 Matches", "Top 5 Matches").
print(pubmedbert_rag_bie_accuracy_df)

  Accuracy Level   Accuracy
0    Top 1 Match  42.253521
1  Top 3 Matches  60.093897
2  Top 5 Matches  69.953052


Workflow Code

In [25]:
from graphviz import Digraph

# Build the workflow diagram declaratively: nodes and edges are listed as data and
# added in the order given, then rendered to a PNG.
dot = Digraph(comment='Unified Ontology Mapping Workflow')
dot.attr(rankdir='TB', fontsize='10', fontname='Helvetica')

PLAIN_BOX = {'shape': 'box'}


def _filled_box(color):
    """Return node attributes for a box filled with the given color."""
    return {'shape': 'box', 'style': 'filled', 'fillcolor': color}


# (node id, label, attributes)
workflow_nodes = [
    # Input
    ('A', 'Input Arguments\n(e.g., query, model, df, config)', PLAIN_BOX),
    # Stage 1: exact matching
    ('C', 'Exact Match?', {'shape': 'diamond'}),
    ('D', 'Exact Matched Terms\n(Stored for Final Merge)', PLAIN_BOX),
    # Abbreviation replacement
    ('E', 'Unmatched Terms:\nReplace Abbreviations\n(via abbreviation dict)', PLAIN_BOX),
    # Strategy selection
    ('F', 'Choose Strategy:\nST / LM / RAG_BIE / RAG', _filled_box('lightblue')),
    # FAISS & DB check
    ('G', 'Check FAISS Index &\nSQLite Table', PLAIN_BOX),
    ('H', 'Encode Corpus Terms (LM/ST)\nor Corpus Contexts (RAG/RAG_BIE)\n→ Store in FAISS & SQLite', PLAIN_BOX),
    ('I', 'Check Completeness\n→ Append Missing if Needed', PLAIN_BOX),
    # Query encoding phase
    ('J1', 'ST / LM / RAG:\nEncode Query Terms', _filled_box('lightyellow')),
    ('J3', 'RAG_BIE:\nEnrich + Encode Query', _filled_box('lightyellow')),
    # Shared FAISS search node
    ('SEARCH', 'FAISS Search', {'shape': 'ellipse', 'style': 'filled', 'fillcolor': 'orange'}),
    # Merge & output
    ('M1', 'Merge Exact Match +\nStrategy Results', _filled_box('lightgreen')),
    ('M2', 'Final Output\nDataFrame', _filled_box('lightgreen')),
]
for node_id, node_label, node_attrs in workflow_nodes:
    dot.node(node_id, node_label, **node_attrs)

# (tail, head, attributes)
workflow_edges = [
    ('A', 'C', {}),
    ('C', 'D', {'label': 'Yes'}),
    ('C', 'E', {'label': 'No'}),
    ('E', 'F', {}),
    ('F', 'G', {}),
    ('G', 'H', {'label': 'Not Exist'}),
    ('G', 'I', {'label': 'Exist'}),
    ('H', 'J1', {}),
    ('H', 'J3', {}),
    ('I', 'J1', {}),
    ('I', 'J3', {}),
    ('J1', 'SEARCH', {}),
    ('J3', 'SEARCH', {}),
    ('D', 'M1', {}),
    ('SEARCH', 'M1', {}),
    ('M1', 'M2', {}),
]
for edge_tail, edge_head, edge_attrs in workflow_edges:
    dot.edge(edge_tail, edge_head, **edge_attrs)

# Optional note, attached to the strategy box with a dashed edge
dot.node(
    'NOTE',
    'Note: \nRAG_BIE is a query-enriched variant of RAG.\nIt may be merged into RAG after development.',
    shape='note',
    style='dashed',
    fontsize='9',
    fontcolor='slategray',
)
dot.edge('F', 'NOTE', style='dashed')

# Render to PNG; cleanup=True is passed so only the image is kept. The returned
# path is the cell output.
dot.render('ontology_mapping_test_workflow', format='png', cleanup=True)


'ontology_mapping_test_workflow.png'