In [None]:
# First, change the working directory to the project root, so that the
# `src.*` imports and the relative `data/...` paths used in later cells resolve.
# The path below is machine-specific: replace it with the location of your own checkout.
# Example: %cd /home/your_username/MetaHarmonizer

%cd /home/lcc/projects/MetaHarmonizer

/home/lcc/projects/MetaHarmonizer


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [None]:
# Required to run asyncio code inside a Jupyter notebook.
# Jupyter already runs its own event loop, so calling asyncio.run() directly would raise an error.
# nest_asyncio.apply() patches the loop so that nested usage is allowed.
# Run this cell once, before any cell that executes the mapping engine.

import nest_asyncio

nest_asyncio.apply()

In [None]:
# Core dependencies plus the ontology-mapper implementations used in this notebook.

import pandas as pd
from importlib import reload

# One module per ontology mapping strategy (ST, LM, RAG, Bi-Encoder)
from src.models import ontology_mapper_st as om_st
from src.models import ontology_mapper_lm as om_lm
from src.models import ontology_mapper_rag_faiss as om_rag
from src.models import ontology_mapper_bi_encoder as om_bi

# The engine that handles pipeline logic and integrates the mappers
from src.Engine import ontology_mapping_engine as ome

# Reload the project modules so that source edits made during development are
# picked up without restarting the kernel. The mapper modules are reloaded
# first and the engine last (presumably the engine imports the mappers, so it
# should be re-executed after them).
# A loop is used instead of five separate calls; it also avoids echoing a
# module repr as the cell's output, which a trailing bare `reload(...)` does.
for _module in (om_st, om_lm, om_rag, om_bi, ome):
    reload(_module)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/lcc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/lcc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/lcc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/lcc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/lcc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<module 'src.models.ontology_mapper_bi_encoder' from '/home/lcc/projects/MetaHarmonizer/src/models/ontology_mapper_bi_encoder.py'>

In [None]:
# Import the CalcStats class for calculating Top1, Top3, and Top5 accuracy.
# The `calc` instance created here is reused further down via calc.calc_accuracy(...).

from src.models.calc_stats import CalcStats

calc = CalcStats()

In [None]:
# Optional housekeeping: remove the FAISS index and SQLite table of one vector store.
# Handy after tests or before re-running an experiment, so no stale data is reused.
# Standard inference or training runs do not need this step.
# NOTE: this is destructive; skip the cell if you want to keep the existing store.

from src.utils.cleanup_vector_store import cleanup_vector_store

# Positional arguments, in order: strategy, model_name, entity_type
cleanup_vector_store(
    "st",
    "mt-sap-bert",
    "disease",
)

[Success] Table 'st_mt_sap_bert_disease' dropped from src/KnowledgeDb/vector_db.sqlite
[Success] Index file 'src/KnowledgeDb/faiss_indexes/st_mt-sap-bert_disease.index' deleted.


Load the query and corpus DataFrames

In [None]:
# Query file: one row per original value, together with its curated ontology term.
df = pd.read_csv("data/corpus/cbio_disease/disease_query_updated.csv")

# Large corpus: official ontology labels to match the queries against.
large_corpus = pd.read_csv("data/corpus/cbio_disease/disease_corpus_updated.csv")

# Values to be mapped.
query_list = df["original_value"].tolist()

# The small corpus consists of the curated ontology values found in the query file.
small_corpus_list = df["curated_ontology"].tolist()

# The large corpus consists of every official label in the corpus file.
large_corpus_list = large_corpus["official_label"].tolist()

# Ground truth lookup: original value -> curated ontology value.
# If an original value occurs more than once, the last occurrence wins.
cura_map = dict(zip(query_list, small_corpus_list))

In [None]:
# ST strategy run (default pooling method; the LM strategy uses CLS token pooling instead).
#
# OntoMapEngine arguments used here:
# - method:      name of the model to use ('mt-sap-bert' in this run)
# - category:    entity category being mapped ('disease' in this run)
# - topk:        number of top matches to retrieve per query
# - query:       list of original values to map
# - corpus:      list of ontology values to match against (here: the large corpus labels)
# - cura_map:    dictionary mapping original values to curated ontology values
# - om_strategy: mapping strategy ('st' here; later cells use 'rag' and 'bie')
# - extra keyword arguments are forwarded from other_params (here: test_or_prod)
#
# run() returns a DataFrame with original values, curated ontology values,
# match levels, stage, and the top k matches with their scores.

other_params = dict(test_or_prod="test")

onto_engine_large = ome.OntoMapEngine(
    method="mt-sap-bert",
    category="disease",
    topk=20,
    query=query_list,
    corpus=large_corpus_list,
    cura_map=cura_map,
    om_strategy="st",
    **other_params,
)

st_sapbert_disease_top20_result = onto_engine_large.run()

09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Initialized OntoMap Engine module
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Running Ontology Mapping
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Separating exact and non-exact matches
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replacing shortNames using rule-based name mapping
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replaced: ACC → Adrenocortical Carcinoma
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replaced: ACYC → Adenoid Cystic Carcinoma
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replaced: AML → Acute Myeloid Leukemia
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replaced: ANSC → Anal Squamous Cell Carcinoma
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replaced: APAD → Appendiceal Adenocarcinoma
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replaced: ASPS → Alveolar Soft Part Sarcoma
09/07/2025 03:37:15 PM - INFO - OntoMapEngine: Replaced: ATRT → Atypical Teratoid/Rhabdoid Tumor
09/07/2025 03:37:15 PM - INF

No sentence-transformers model found with name model_cache/mt-sap-bert. Creating a new one with mean pooling.
Batches: 100%|██████████| 16/16 [00:00<00:00, 22.10it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 62.87it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 65.62it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 73.16it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 75.38it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 71.31it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 70.76it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 72.96it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 75.97it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 73.88it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 70.75it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 70.94it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 64.52it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 70.14it/s]
Batches: 100%|██████████| 16/16 [00:00<00:00, 71.41it/s]
Batches: 100%|██████████| 16/16 [00

In [None]:
# Calculate Top1, Top3, and Top5 accuracy for the ST results generated above.

st_sapbert_accuracy_df = calc.calc_accuracy(st_sapbert_disease_top20_result)

# Leave the DataFrame as the cell's last expression so that Jupyter renders it
# with its rich table display instead of the plain-text output of print().
st_sapbert_accuracy_df

  Accuracy Level   Accuracy
0    Top 1 Match  75.241779
1  Top 3 Matches  84.526112
2  Top 5 Matches  87.685364


In [None]:
# Optional: persist the ST results as a CSV file for later analysis or reporting.
# The file name is relative, so it is written to the current working directory;
# the DataFrame index is not written.

st_sapbert_disease_top20_result.to_csv("st_sapbert_disease_top20_result.csv", index=False)

In [None]:
# RAG strategy run: in addition to the arguments used for the ST strategy,
# the engine needs corpus_df (the corpus DataFrame) for concept retrieval.
# Note: this cell reassigns `other_params` and `onto_engine_large`.

other_params = dict(test_or_prod="test")

onto_engine_large = ome.OntoMapEngine(
    method="pubmed-bert",
    category="disease",
    topk=5,
    query=query_list,
    corpus=large_corpus_list,
    cura_map=cura_map,
    om_strategy="rag",
    corpus_df=large_corpus,
    **other_params,
)

pubmed_bert_result = onto_engine_large.run()

In [None]:
# BIE (Bi-Encoder) strategy run: the engine needs corpus_df for concept
# retrieval and query_df (the query DataFrame) for query enrichment.
# Note: this cell reassigns `other_params` and `onto_engine_large`.

other_params = dict(test_or_prod="test")

onto_engine_large = ome.OntoMapEngine(
    method="pubmed-bert",
    category="disease",
    topk=20,
    query=query_list,
    corpus=large_corpus_list,
    cura_map=cura_map,
    om_strategy="bie",
    corpus_df=large_corpus,
    query_df=df,
    **other_params,
)

pubmedbert_bie_result = onto_engine_large.run()