In [None]:
import numpy as np
import os
from sentence_transformers import SentenceTransformer
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    stop_after_delay,
)
import openai
import spacy

from olaf import Pipeline
from olaf.commons.errors import MissingEnvironmentVariable
from olaf.commons.kr_to_rdf_tools import (
    kr_concepts_to_owl_classes, kr_relations_to_owl_obj_props, 
    kr_metarelations_to_owl, kr_relations_to_anonymous_some_parent, concept_lrs_to_owl_individuals
)
from olaf.commons.logging_config import logger
from olaf.commons.llm_tools import LLMGenerator
from olaf.pipeline.pipeline_component.axiom_extraction import OWLAxiomExtraction
from olaf.pipeline.pipeline_component.concept_relation_extraction import (
    AgglomerativeClusteringConceptExtraction, 
    AgglomerativeClusteringRelationExtraction
)
from olaf.pipeline.pipeline_component.concept_relation_hierarchy import LLMBasedHierarchisation
from olaf.pipeline.pipeline_component.term_extraction import LLMTermExtraction
from olaf.repository.serialiser import KRJSONSerialiser

In [None]:
# Notebook configuration: empty placeholders to fill in before running the cells below.
# DATA_PATH: text file opened for reading to build the corpus.
DATA_PATH = ""
# KR_PATH: file path handed to the KR JSON serialiser.
KR_PATH = ""
# ONTO_OWL_PATH: destination of the RDF graph serialisation (written with format="ttl").
ONTO_OWL_PATH = ""

-----------

# KG building

In [None]:
# Read the whole raw corpus file into memory.
# The encoding is given explicitly so that decoding does not depend on the
# platform default (which is not UTF-8 on every system).
# NOTE(review): assumes the corpus file is UTF-8 encoded — adjust if it is not.
with open(DATA_PATH, "r", encoding="utf-8") as f:
    file_content = f.read()

In [3]:
# Split the raw text into extracts on '#', lower-case each extract and collapse
# every run of whitespace (including newlines) into a single space.
normalised_extracts = (
    " ".join(extract.lower().split()) for extract in file_content.split("#")
)
# Filter AFTER normalisation: an extract made only of whitespace (e.g. the
# newline between two consecutive '#') normalises to "" and must not end up
# in the corpus as an empty document.
corpus = [extract for extract in normalised_extracts if extract]

In [4]:
# Load the spaCy model named "fr_core_news_lg".
spacy_model = spacy.load("fr_core_news_lg")
# Run every corpus string through the spaCy model and keep the results in a list.
spacy_corpus = list(spacy_model.pipe(corpus))

In [5]:
class GPT4oMiniGenerator(LLMGenerator):
    """Text generator based on OpenAI gpt-4o-mini model."""

    def check_resources(self) -> None:
        """Check that the resources needed to use the OpenAI Generator are available.

        Raises
        ------
        MissingEnvironmentVariable
            If the ``OPENAI_API_KEY`` environment variable is not set.
        """
        if "OPENAI_API_KEY" not in os.environ:
            raise MissingEnvironmentVariable(self.__class__, "OPENAI_API_KEY")

    def generate_text(self, prompt: list[dict[str, str]]) -> str:
        """Generate text based on a chat completion prompt for the OpenAI gpt-4o-mini model.

        Parameters
        ----------
        prompt: list[dict[str, str]]
            Chat completion messages passed as-is to the OpenAI API.

        Returns
        -------
        str
            Text content of the first completion choice. An empty string is
            returned when the call still fails after the retries or when the
            model returns no text content.
        """
        # Created before the nested function so the closure reads top to bottom.
        client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

        # Only transient API errors are retried. The exception types are given as
        # a tuple: a `A | B` union of classes only exists from Python 3.10 on.
        # NOTE(review): no `wait=` strategy is configured, so these retries are
        # not spaced out by tenacity — consider adding a backoff.
        @retry(
            stop=stop_after_delay(15) | stop_after_attempt(3),
            retry=retry_if_exception_type(
                (
                    openai.APIConnectionError,
                    openai.APITimeoutError,
                    openai.RateLimitError,
                    openai.InternalServerError,
                )
            ),
            reraise=True,
        )
        def openai_call():
            return client.chat.completions.create(
                model="gpt-4o-mini",
                temperature=0,
                messages=prompt,
            )

        llm_output = ""
        try:
            response = openai_call()
            # `content` may be None; fall back to "" to honour the `str` return type.
            llm_output = response.choices[0].message.content or ""
        except Exception as e:
            # Best effort: a failing document is logged and skipped rather than
            # aborting the whole run. The excerpt is built defensively so that
            # an empty or malformed prompt cannot raise from inside this handler.
            last_message = prompt[-1] if prompt else {}
            excerpt = str(last_message.get("content", ""))[5:100]
            logger.error(
                """Exception %s still occurred after retries on OpenAI API.
                         Skipping document %s...""",
                e,
                excerpt,
            )

        return llm_output
    
# Generator instance passed to the LLM-based pipeline components in the cells below.
llm_model = GPT4oMiniGenerator()

In [6]:
# Pipeline built from the spaCy model and the spaCy-processed corpus;
# every component in the cells below is run against this object.
pipeline = Pipeline(spacy_model, corpus=spacy_corpus)

In [8]:
def prompt_cterm_extraction(context: str) -> list[dict[str, str]]:
    """Build the chat prompt asking the model for concept terms found in a text.

    Parameters
    ----------
    context: str
        Text appended after the "Text: " prefix of the last user message.

    Returns
    -------
    list[dict[str, str]]
        Four chat messages (one system message followed by three user
        messages), each a dict with a "role" and a "content" key.
    """
    system_role = "You are an helpful assistant helping building an ontology."
    task_instruction = (
        "Extract the most meaningful keywords of the following text. "
        "Keep only keywords that could be concepts and not relations. "
        "Write them as a list of string with double quotes."
    )
    worked_example = (
        "Here is an example. "
        "Text: This python package is about ontology learning. I do not know a lot about this field.\n"
        '["python package", "ontology learning", "field"]'
    )
    # (role, content) pairs in the order they are sent to the model.
    turns = (
        ("system", system_role),
        ("user", task_instruction),
        ("user", worked_example),
        ("user", f"Text: {context}"),
    )
    return [{"role": role, "content": content} for role, content in turns]
# Build the LLM term extraction component from the concept-term prompt and the
# generator, then run it on the pipeline.
llm_cterm_extraction = LLMTermExtraction(prompt_cterm_extraction, llm_model)
llm_cterm_extraction.run(pipeline)

In [None]:
# Concept extraction component based on agglomerative clustering, configured
# with a distance threshold and the name of the embedding model to use.
ac_concept_extraction = AgglomerativeClusteringConceptExtraction(
    distance_threshold=0.2,
    embedding_model="dangvantuan/sentence-camembert-large",
)
ac_concept_extraction.run(pipeline)

In [12]:
def prompt_rterm_extraction(context: str) -> list[dict[str, str]]:
    """Build the chat prompt asking the model for relation terms found in a text.

    Parameters
    ----------
    context: str
        Text appended after the "Text: " prefix of the last user message.

    Returns
    -------
    list[dict[str, str]]
        Four chat messages (one system message followed by three user
        messages), each a dict with a "role" and a "content" key.
    """
    system_role = "You are an helpful assistant helping building an ontology."
    task_instruction = (
        "Extract the most meaningful words describing actions or states in the following text. "
        "Keep only words that could be relations and not concepts. "
        "Write them as a list of string with double quotes."
    )
    worked_example = (
        "Here is an example. "
        "Text: I plan to eat pizza tonight. I am looking for advice.\n"
        '["plan", "eat", "looking for"]'
    )
    # (role, content) pairs in the order they are sent to the model.
    turns = (
        ("system", system_role),
        ("user", task_instruction),
        ("user", worked_example),
        ("user", f"Text: {context}"),
    )
    return [{"role": role, "content": content} for role, content in turns]

# Build the LLM term extraction component from the relation-term prompt and the
# generator, then run it on the pipeline.
llm_rterm_extraction = LLMTermExtraction(prompt_rterm_extraction, llm_model)
llm_rterm_extraction.run(pipeline)

In [None]:
# Relation extraction component based on agglomerative clustering.
# NOTE(review): `concept_max_distance=8` and `scope="sent"` presumably restrict
# which concept pairs a relation can link — confirm against the OLAF documentation.
ac_relation_extraction = AgglomerativeClusteringRelationExtraction(
    distance_threshold=0.2,
    embedding_model="dangvantuan/sentence-camembert-large",
    concept_max_distance=8,
    scope="sent",
)
ac_relation_extraction.run(pipeline)

In [None]:
def prompt_hierarchisation(
    doc_context: str, concepts_description: str
) -> list[dict[str, str]]:
    """Build the chat prompt asking the model for a hierarchy between concepts.

    Parameters
    ----------
    doc_context: str
        Extract of document contents where the concepts appear; appended after
        the "Context: " prefix of the fourth message.
    concepts_description: str
        Textual description of the concepts; sent unchanged as the last message.

    Returns
    -------
    list[dict[str, str]]
        Five chat messages (one system message followed by four user
        messages), each a dict with a "role" and a "content" key.
    """
    system_role = "You are an helpful assistant helping building an ontology."
    # The newline followed by twelve spaces inside the next two messages is
    # part of the prompt text sent to the model.
    task_instruction = (
        "Based on the context given, define if there is a relevant hierarchy between the listed concepts.\n"
        "            The result should be given as a list of list of string with double quotes without any other content."
    )
    worked_example = (
        "Here is an example. Concepts: animal, mammal, dog(canine), flower\n"
        '            [["mammal","is_generalised_by","animal"], ["dog","is_generalised_by","mammal"], ["dog","is_generalised_by","animal"]]'
    )
    # (role, content) pairs in the order they are sent to the model.
    turns = (
        ("system", system_role),
        ("user", task_instruction),
        ("user", worked_example),
        ("user", f"Context: {doc_context}"),
        ("user", concepts_description),
    )
    return [{"role": role, "content": content} for role, content in turns]

# Build the LLM-based hierarchisation component from the hierarchy prompt and
# the generator, then run it on the pipeline.
# NOTE(review): the third positional argument (20000) is presumably a maximum
# context length — confirm against the LLMBasedHierarchisation signature.
llm_concept_hierarchy = LLMBasedHierarchisation(prompt_hierarchisation, llm_model, 20000)
llm_concept_hierarchy.run(pipeline)

In [None]:
# Serialise the pipeline's knowledge representation (pipeline.kr) to KR_PATH
# with the JSON serialiser.
kr_serialiser = KRJSONSerialiser()
kr_serialiser.serialise(kr=pipeline.kr, file_path=KR_PATH)

In [None]:
# Set of generator functions handed to the OWL axiom extraction component.
axiom_generators = {
    kr_concepts_to_owl_classes,
    kr_relations_to_owl_obj_props,
    kr_metarelations_to_owl,
    kr_relations_to_anonymous_some_parent,
    concept_lrs_to_owl_individuals,
}

# Build the OWL axiom extraction component with those generators and a base
# URI, then run it on the pipeline.
owl_axiom_extraction = OWLAxiomExtraction(
    owl_axiom_generators=axiom_generators,
    base_uri="https://github.com/schmarion/phd-code/o/example#",
)
owl_axiom_extraction.run(pipeline)

In [None]:
# Write the RDF graph held by the knowledge representation to ONTO_OWL_PATH,
# using the "ttl" serialisation format.
pipeline.kr.rdf_graph.serialize(ONTO_OWL_PATH, format="ttl")