In [None]:
import os
import time

import ontoaligner

# MaterialInformation-MatOnto

In [None]:
# Constructor arguments for the MaterialInformation-MatOnto task: dataset
# class, source and target ontologies, reference alignment, and output settings.
pipeline_kwargs = {
    "task_class": ontoaligner.ontology.MaterialInformationMatOntoOMDataset,
    "source_ontology_path": "assets/MI-MatOnto/mi_ontology.xml",
    "target_ontology_path": "assets/MI-MatOnto/matonto_ontology.xml",
    "reference_matching_path": "assets/MI-MatOnto/matchings.xml",
    "output_dir": "results",
    "output_format": "json",
}
# Binds `pipeline`; the next cell calls it to run the alignment.
pipeline = ontoaligner.OntoAlignerPipeline(**pipeline_kwargs)

In [3]:
# Few-shot RAG run on the MaterialInformation-MatOnto pipeline built above.
# `llm_path` / `retriever_path` select the checkpoints handed to the matcher;
# the two thresholds are forwarded to the pipeline and echoed after the run.
llm_path = 'mistralai/Mistral-7B-v0.3'
retriever_path = 'all-MiniLM-L6-v2'
method = "fewshot-rag"
ir_threshold = 0.4
llm_threshold = 0.4

# perf_counter() is the clock intended for measuring durations; time.time()
# is wall-clock time and can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
matchings, evaluation = pipeline(
    method=method,
    encoder_model=ontoaligner.encoder.ConceptChildrenFewShotEncoder(),
    model_class=ontoaligner.ontology_matchers.MistralLLMBERTRetrieverFSRAG,
    postprocessor=ontoaligner.postprocess.rag_hybrid_postprocessor,
    llm_path=llm_path,
    retriever_path=retriever_path,
    llm_threshold=llm_threshold,
    ir_rag_threshold=ir_threshold,
    n_shots=2,
    top_k=5,
    max_length=512,
    max_new_tokens=10,
    device='cuda',
    batch_size=32,
    return_matching=True,
    evaluate=True,
)
print("ir_threshold:", ir_threshold)
print("llm_threshold:", llm_threshold)
print("EVAL:", evaluation)
print("----"*10)
# "RS" is the elapsed run time in seconds.
print("RS:", time.perf_counter() - start)

# Last expression: display the evaluation dict as the cell's result.
evaluation

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/52 [00:00<?, ?it/s]

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

545it [00:00, 9972.54it/s]
100%|██████████| 545/545 [00:00<00:00, 425996.21it/s]


No of random_positive_examples examples: 1
No of random_negative_examples examples: 1


  0%|          | 0/86 [00:00<?, ?it/s]From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.
100%|██████████| 86/86 [05:30<00:00,  3.84s/it]
100%|██████████| 444/444 [00:00<00:00, 176837.05it/s]

ir_threshold: 0.4
llm_threshold: 0.4
EVAL: {'intersection': 102, 'precision': 65.38461538461539, 'recall': 33.77483443708609, 'f-score': 44.54148471615721, 'predictions-len': 156, 'reference-len': 302}
----------------------------------------
RS: 342.7561044692993





{'intersection': 102,
 'precision': 65.38461538461539,
 'recall': 33.77483443708609,
 'f-score': 44.54148471615721,
 'predictions-len': 156,
 'reference-len': 302}

# Fish-Zooplankton

In [None]:
# Constructor arguments for the Fish-Zooplankton task: dataset class, source
# and target ontologies, reference alignment, and output settings.
pipeline_kwargs = {
    "task_class": ontoaligner.ontology.FishZooplanktonOMDataset,
    "source_ontology_path": "assets/fish-zooplankton/source.xml",
    "target_ontology_path": "assets/fish-zooplankton/target.xml",
    "reference_matching_path": "assets/fish-zooplankton/reference.xml",
    "output_dir": "results",
    "output_format": "json",
}
# Rebinds `pipeline`; the next cell calls it to run the alignment.
pipeline = ontoaligner.OntoAlignerPipeline(**pipeline_kwargs)

In [5]:
# RAG run on the Fish-Zooplankton pipeline using the LLaMALLMBERTRetrieverRAG
# matcher. The thresholds are forwarded to the pipeline and echoed afterwards.

# perf_counter() is the clock intended for measuring durations; time.time()
# is wall-clock time and can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
method = "rag"
llm_path = 'meta-llama/Llama-3.2-3B'
retriever_path = 'all-MiniLM-L6-v2'
ir_threshold = 0.7
llm_threshold = 0.95
# Read the Hugging Face token from the environment rather than pasting it into
# the notebook; falls back to "" (the previous hard-coded value) when HF_TOKEN
# is not set.
huggingface_access_token = os.environ.get("HF_TOKEN", "")

matchings, evaluation = pipeline(
    method=method,
    encoder_model=ontoaligner.encoder.ConceptRAGEncoder(),
    model_class=ontoaligner.ontology_matchers.LLaMALLMBERTRetrieverRAG,
    postprocessor=ontoaligner.postprocess.rag_hybrid_postprocessor,
    llm_path=llm_path,
    retriever_path=retriever_path,
    llm_threshold=llm_threshold,
    ir_rag_threshold=ir_threshold,
    top_k=5,
    max_length=512,
    max_new_tokens=10,
    device='cuda',
    batch_size=128,
    return_matching=True,
    evaluate=True,
    huggingface_access_token=huggingface_access_token,
)
print("ir_threshold:", ir_threshold)
print("llm_threshold:", llm_threshold)
print("EVAL:", evaluation)
print("----"*10)
# "RS" is the elapsed run time in seconds.
print("RS:", time.perf_counter() - start)
# Last expression: display the evaluation dict as the cell's result.
evaluation

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/207 [00:00<?, ?it/s]

Batches:   0%|          | 0/172 [00:00<?, ?it/s]

2737it [00:00, 16323.80it/s]
100%|██████████| 2737/2737 [00:00<00:00, 727076.45it/s]
100%|██████████| 60/60 [01:47<00:00,  1.79s/it]
100%|██████████| 7592/7592 [00:00<00:00, 39242.53it/s]


ir_threshold: 0.7
llm_threshold: 0.95
EVAL: {'intersection': 1291, 'precision': 87.7038043478261, 'recall': 85.15831134564644, 'f-score': 86.41231593038823, 'predictions-len': 1472, 'reference-len': 1516}
----------------------------------------
RS: 116.54034519195557


{'intersection': 1291,
 'precision': 87.7038043478261,
 'recall': 85.15831134564644,
 'f-score': 86.41231593038823,
 'predictions-len': 1472,
 'reference-len': 1516}

# Mouse-Human

In [None]:
# Constructor arguments for the Mouse-Human task: dataset class, source and
# target ontologies, reference alignment, and output settings.
pipeline_kwargs = {
    "task_class": ontoaligner.ontology.MouseHumanOMDataset,
    "source_ontology_path": "assets/mouse-human/source.xml",
    "target_ontology_path": "assets/mouse-human/target.xml",
    "reference_matching_path": "assets/mouse-human/reference.xml",
    "output_dir": "results",
    "output_format": "json",
}
# Rebinds `pipeline`; the next cell calls it to run the alignment.
pipeline = ontoaligner.OntoAlignerPipeline(**pipeline_kwargs)

In [5]:
# RAG run on the Mouse-Human pipeline using the LLaMALLMBERTRetrieverRAG
# matcher. The thresholds are forwarded to the pipeline and echoed afterwards.
#
# NOTE(review): the saved output under this cell is identical to the
# Fish-Zooplankton run (same counts, scores and elapsed time), so it looks
# stale — re-run this cell to record actual Mouse-Human results.

# perf_counter() is the clock intended for measuring durations; time.time()
# is wall-clock time and can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
method = "rag"
llm_path = 'meta-llama/Llama-3.2-3B'
retriever_path = 'all-MiniLM-L6-v2'
ir_threshold = 0.7
llm_threshold = 0.95
# Read the Hugging Face token from the environment rather than pasting it into
# the notebook; falls back to "" (the previous hard-coded value) when HF_TOKEN
# is not set.
huggingface_access_token = os.environ.get("HF_TOKEN", "")

matchings, evaluation = pipeline(
    method=method,
    encoder_model=ontoaligner.encoder.ConceptRAGEncoder(),
    model_class=ontoaligner.ontology_matchers.LLaMALLMBERTRetrieverRAG,
    postprocessor=ontoaligner.postprocess.rag_hybrid_postprocessor,
    llm_path=llm_path,
    retriever_path=retriever_path,
    llm_threshold=llm_threshold,
    ir_rag_threshold=ir_threshold,
    top_k=5,
    max_length=512,
    max_new_tokens=10,
    device='cuda',
    batch_size=128,
    return_matching=True,
    evaluate=True,
    huggingface_access_token=huggingface_access_token,
)
print("ir_threshold:", ir_threshold)
print("llm_threshold:", llm_threshold)
print("EVAL:", evaluation)
print("----"*10)
# "RS" is the elapsed run time in seconds.
print("RS:", time.perf_counter() - start)
# Last expression: display the evaluation dict as the cell's result.
evaluation

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/207 [00:00<?, ?it/s]

Batches:   0%|          | 0/172 [00:00<?, ?it/s]

2737it [00:00, 16323.80it/s]
100%|██████████| 2737/2737 [00:00<00:00, 727076.45it/s]
100%|██████████| 60/60 [01:47<00:00,  1.79s/it]
100%|██████████| 7592/7592 [00:00<00:00, 39242.53it/s]


ir_threshold: 0.7
llm_threshold: 0.95
EVAL: {'intersection': 1291, 'precision': 87.7038043478261, 'recall': 85.15831134564644, 'f-score': 86.41231593038823, 'predictions-len': 1472, 'reference-len': 1516}
----------------------------------------
RS: 116.54034519195557


{'intersection': 1291,
 'precision': 87.7038043478261,
 'recall': 85.15831134564644,
 'f-score': 86.41231593038823,
 'predictions-len': 1472,
 'reference-len': 1516}

# Macroalgae-Macrozoobenthos

In [None]:
# Constructor arguments for the Macroalgae-Macrozoobenthos task: dataset class,
# source and target ontologies, reference alignment, and output settings.
pipeline_kwargs = {
    "task_class": ontoaligner.ontology.MacroalgaeMacrozoobenthosOMDataset,
    "source_ontology_path": "assets/macroalgae-macrozoobenthos/source.xml",
    "target_ontology_path": "assets/macroalgae-macrozoobenthos/target.xml",
    "reference_matching_path": "assets/macroalgae-macrozoobenthos/reference.xml",
    "output_dir": "results",
    "output_format": "json",
}
# Rebinds `pipeline`; the next cell calls it to run the alignment.
pipeline = ontoaligner.OntoAlignerPipeline(**pipeline_kwargs)

In [5]:
class QwenLLMTFIDFRetrieverRAG(ontoaligner.ontology_matchers.RAG):
    """RAG matcher that pairs TFIDFRetrieval with AutoModelDecoderRAGLLMV2.

    The Qwen checkpoint itself is not fixed here; it is chosen by the
    ``llm_path`` passed to the pipeline call in this cell.
    """

    # Component classes the RAG base class is configured with.
    Retrieval = ontoaligner.ontology_matchers.TFIDFRetrieval
    LLM = ontoaligner.ontology_matchers.AutoModelDecoderRAGLLMV2

    def __str__(self):
        """Return the parent's string form suffixed with this class's tag."""
        parent_label = super().__str__()
        return parent_label + "-QwenLLMTFIDFRetrieverRAG"
    
    
# RAG run on the Macroalgae-Macrozoobenthos pipeline using the
# QwenLLMTFIDFRetrieverRAG matcher defined above. No retriever_path is passed
# in this call.

# perf_counter() is the clock intended for measuring durations; time.time()
# is wall-clock time and can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
method = "rag"
llm_threshold = 0.8
# NOTE(review): the saved output for this cell prints "ir_threshold: 0.0", not
# the 0.2 assigned here — the recorded scores presumably come from an earlier
# run with a different threshold; re-run or confirm which value is intended.
ir_threshold = 0.2
llm_path = 'Qwen/Qwen2-0.5B'

matchings, evaluation = pipeline(
    method=method,
    encoder_model=ontoaligner.encoder.ConceptRAGEncoder(),
    model_class=QwenLLMTFIDFRetrieverRAG,
    postprocessor=ontoaligner.postprocess.rag_hybrid_postprocessor,
    llm_path=llm_path,
    llm_threshold=llm_threshold,
    ir_rag_threshold=ir_threshold,
    top_k=5,
    max_length=512,
    max_new_tokens=10,
    device='cuda',
    batch_size=64,
    return_matching=True,
    evaluate=True
)
print("ir_threshold:", ir_threshold)
print("llm_threshold:", llm_threshold)
print("EVAL:", evaluation)
print("----"*10)
# Elapsed run time in seconds; kept in `t` so it stays available after the cell.
t = time.perf_counter() - start
print("RS:", t)
# Last expression: display the evaluation dict as the cell's result.
evaluation

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
108it [00:00, 1506.91it/s]
100%|██████████| 108/108 [00:00<00:00, 351151.03it/s]
100%|██████████| 2/2 [00:00<00:00,  2.71it/s]
100%|██████████| 77/77 [00:00<00:00, 153644.82it/s]

ir_threshold: 0.0
llm_threshold: 0.8
EVAL: {'intersection': 12, 'precision': 75.0, 'recall': 66.66666666666666, 'f-score': 70.58823529411765, 'predictions-len': 16, 'reference-len': 18}
----------------------------------------
RS: 2.804941177368164





{'intersection': 12,
 'precision': 75.0,
 'recall': 66.66666666666666,
 'f-score': 70.58823529411765,
 'predictions-len': 16,
 'reference-len': 18}

# Nell-Dbpedia

In [None]:
# Constructor arguments for the Nell-Dbpedia task: dataset class, source and
# target ontologies, reference alignment, and output settings.
pipeline_kwargs = {
    "task_class": ontoaligner.ontology.NellDbpediaOMDataset,
    "source_ontology_path": "assets/nell-dbpedia/source.xml",
    "target_ontology_path": "assets/nell-dbpedia/target.xml",
    "reference_matching_path": "assets/nell-dbpedia/reference.xml",
    "output_dir": "results",
    "output_format": "json",
}
# Rebinds `pipeline`; the next cell calls it to run the alignment.
pipeline = ontoaligner.OntoAlignerPipeline(**pipeline_kwargs)

In [25]:
class QwenLLMBERTRetrieverRAG(ontoaligner.ontology_matchers.RAG):
    """RAG matcher that pairs SBERTRetrieval with AutoModelDecoderRAGLLMV2.

    The Qwen and retriever checkpoints are not fixed here; they are chosen by
    the ``llm_path`` / ``retriever_path`` passed to the pipeline call in this
    cell.
    """

    # Component classes the RAG base class is configured with.
    Retrieval = ontoaligner.ontology_matchers.SBERTRetrieval
    LLM = ontoaligner.ontology_matchers.AutoModelDecoderRAGLLMV2

    def __str__(self):
        """Return the parent's string form suffixed with this class's tag."""
        parent_label = super().__str__()
        return parent_label + "-QwenLLMBERTRetrieverRAG"
    
    
# RAG run on the Nell-Dbpedia pipeline using the QwenLLMBERTRetrieverRAG
# matcher defined above; `ir_path` is passed as the retriever checkpoint.

# perf_counter() is the clock intended for measuring durations; time.time()
# is wall-clock time and can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
method = "rag"
llm_threshold = 0.7
# NOTE(review): the saved output for this cell prints "ir_threshold: 0.0", not
# the 0.2 assigned here — the recorded scores presumably come from an earlier
# run with a different threshold; re-run or confirm which value is intended.
ir_threshold = 0.2
llm_path = 'Qwen/Qwen2-0.5B'
ir_path = 'sentence-transformers/sentence-t5-base'

matchings, evaluation = pipeline(
    method=method,
    encoder_model=ontoaligner.encoder.ConceptRAGEncoder(),
    model_class=QwenLLMBERTRetrieverRAG,
    postprocessor=ontoaligner.postprocess.rag_hybrid_postprocessor,
    llm_path=llm_path,
    retriever_path=ir_path,
    llm_threshold=llm_threshold,
    ir_rag_threshold=ir_threshold,
    top_k=5,
    max_length=512,
    max_new_tokens=10,
    device='cuda',
    batch_size=2048,
    return_matching=True,
    evaluate=True
)
print("ir_threshold:", ir_threshold)
print("llm_threshold:", llm_threshold)
print("EVAL:", evaluation)
print("----"*10)
# Elapsed run time in seconds; kept in `t` so it stays available after the cell.
t = time.perf_counter() - start
print("RS:", t)
# Last expression: display the evaluation dict as the cell's result.
evaluation

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

134it [00:00, 23359.80it/s]
100%|██████████| 134/134 [00:00<00:00, 264986.67it/s]
100%|██████████| 1/1 [00:01<00:00,  1.02s/it]
100%|██████████| 670/670 [00:00<00:00, 314408.56it/s]

ir_threshold: 0.0
llm_threshold: 0.7
EVAL: {'intersection': 126, 'precision': 97.67441860465115, 'recall': 97.67441860465115, 'f-score': 97.67441860465115, 'predictions-len': 129, 'reference-len': 129}
----------------------------------------
RS: 5.911125183105469





{'intersection': 126,
 'precision': 97.67441860465115,
 'recall': 97.67441860465115,
 'f-score': 97.67441860465115,
 'predictions-len': 129,
 'reference-len': 129}

# Yago-Wikidata

In [None]:
# Constructor arguments for the Yago-Wikidata task: dataset class, source and
# target ontologies, reference alignment, and output settings.
pipeline_kwargs = {
    "task_class": ontoaligner.ontology.YagoWikidataOMDataset,
    "source_ontology_path": "assets/yago-wikidata/source.xml",
    "target_ontology_path": "assets/yago-wikidata/target.xml",
    "reference_matching_path": "assets/yago-wikidata/reference.xml",
    "output_dir": "results",
    "output_format": "json",
}
# Rebinds `pipeline`; the next cell calls it to run the alignment.
pipeline = ontoaligner.OntoAlignerPipeline(**pipeline_kwargs)

In [5]:
class MinistralLLMBERTRetrieverRAG(ontoaligner.ontology_matchers.RAG):
    """RAG matcher that pairs SBERTRetrieval with AutoModelDecoderRAGLLM.

    The Ministral and retriever checkpoints are not fixed here; they are
    chosen by the ``llm_path`` / ``retriever_path`` passed to the pipeline
    call in this cell.
    """

    # Component classes the RAG base class is configured with.
    Retrieval = ontoaligner.ontology_matchers.SBERTRetrieval
    LLM = ontoaligner.ontology_matchers.AutoModelDecoderRAGLLM

    def __str__(self):
        """Return the parent's string form suffixed with this class's tag."""
        parent_label = super().__str__()
        return parent_label + "-MinistralLLMBERTRetrieverRAG"
    

# RAG run on the Yago-Wikidata pipeline using the MinistralLLMBERTRetrieverRAG
# matcher defined above; `ir_path` is passed as the retriever checkpoint.

# perf_counter() is the clock intended for measuring durations; time.time()
# is wall-clock time and can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
method = "rag"
llm_threshold = 0.5
# NOTE(review): the saved output for this cell prints "ir_threshold: 0.0", not
# the 0.2 assigned here — the recorded scores presumably come from an earlier
# run with a different threshold; re-run or confirm which value is intended.
ir_threshold = 0.2
llm_path = 'ministral/Ministral-3b-instruct'
ir_path = 'sentence-transformers/sentence-t5-base'

matchings, evaluation = pipeline(
    method=method,
    encoder_model=ontoaligner.encoder.ConceptRAGEncoder(),
    model_class=MinistralLLMBERTRetrieverRAG,
    postprocessor=ontoaligner.postprocess.rag_hybrid_postprocessor,
    llm_path=llm_path,
    retriever_path=ir_path,
    llm_threshold=llm_threshold,
    ir_rag_threshold=ir_threshold,
    top_k=5,
    max_length=512,
    max_new_tokens=10,
    device='cuda',
    batch_size=64,
    return_matching=True,
    evaluate=True
)
print("ir_threshold:", ir_threshold)
print("llm_threshold:", llm_threshold)
print("EVAL:", evaluation)
print("----"*10)
# Elapsed run time in seconds; kept in `t` so it stays available after the cell.
t = time.perf_counter() - start
print("RS:", t)
# Last expression: display the evaluation dict as the cell's result.
evaluation

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

304it [00:00, 8668.63it/s]
100%|██████████| 304/304 [00:00<00:00, 414386.88it/s]
100%|██████████| 24/24 [00:26<00:00,  1.10s/it]
100%|██████████| 1519/1519 [00:00<00:00, 214791.58it/s]

ir_threshold: 0.0
llm_threshold: 0.5
EVAL: {'intersection': 283, 'precision': 99.29824561403508, 'recall': 93.0921052631579, 'f-score': 96.09507640067912, 'predictions-len': 285, 'reference-len': 304}
----------------------------------------
RS: 33.73451352119446





{'intersection': 283,
 'precision': 99.29824561403508,
 'recall': 93.0921052631579,
 'f-score': 96.09507640067912,
 'predictions-len': 285,
 'reference-len': 304}