# Plan de experimento - Evaluación de la generación de ontologías con y sin RAG

**Objetivo:** Evaluar la capacidad del LLM (Mistral-7B-Instruct-v0.3) para generar TTL válido:
- Sin RAG (zero-shot)
- Con RAG (con contexto de la ontología oficial + dataset)

**Ontología oficial:** tad.txt  
**Dataset:** CancerEnD_1000.nt  
**Métricas:**
- Cobertura de clases (%)
- Cobertura de propiedades (%)
- Nuevas propiedades
- Inconsistencias
- Nota manual (calidad TTL)

In [None]:
# Imports y configuración
from rdflib import Graph, URIRef
from sentence_transformers import SentenceTransformer
import chromadb
from transformers import pipeline

# Filesystem locations of the sentence-embedding model and the LLM weights.
# NOTE(review): these are absolute, machine-specific paths.
model_path_embed = "/Users/franciscosaez/.lmstudio/models/sentence_transformers/all-MiniLM-L6-v2"
model_path_llm = "/Users/franciscosaez/Downloads/Mistral-7B-Instruct-v0.3"
# Reference ontology (parsed as Turtle further down) and the N-Triples dataset.
ttl_path = "tad.txt"
nt_path = "./ontologias_experimentos_RAG/CancerEnD_1000.nt"

# Initialize models
model = SentenceTransformer(model_path_embed)

# Vector store: one collection receives both the ontology fragments and the
# dataset fragments indexed later in this notebook.
client = chromadb.Client()
collection = client.get_or_create_collection("biological_data_full")

## Indexación TTL / NT

In [None]:
# --- Ontology (Turtle) ---
# Build one text fragment per distinct subject; each fragment lists every
# triple that has that subject, one "s p o ." line per triple.
g = Graph()
g.parse(ttl_path, format="turtle")

# Graph.subjects() yields a subject once per matching triple, so a subject
# with k triples would otherwise produce k identical fragments (and k
# duplicate entries in the vector index). dict.fromkeys() drops the repeats
# while keeping first-seen order.
unique_subjects = list(dict.fromkeys(g.subjects()))

ttl_fragments = []
for s in unique_subjects:
    lines = [
        f"{str(subj)} {str(pred)} {str(obj)} ."
        for (subj, pred, obj) in g.triples((s, None, None))
    ]
    ttl_fragments.append("\n".join(lines))

print(f"TTL: {len(ttl_fragments)} fragmentos.")

# --- Dataset (N-Triples) ---
# N-Triples documents are UTF-8, so the encoding is set explicitly instead of
# relying on the platform default. Blank lines are skipped.
with open(nt_path, "r", encoding="utf-8") as file:
    tripletas = [line.strip() for line in file if line.strip()]

# Group the dataset triples into consecutive blocks of `block_size` lines;
# the last block may be shorter.
block_size = 20
nt_fragments = [
    "\n".join(tripletas[i:i+block_size])
    for i in range(0, len(tripletas), block_size)
]

print(f"NT: {len(nt_fragments)} fragmentos.")

## Inserción en ChromaDB

In [None]:
def _index_fragments(prefix, fragments, batch_size=256):
    """Embed *fragments* and add them to ``collection`` in batches.

    Each fragment is stored under the id ``f"{prefix}{i}"``, where ``i`` is
    its position in *fragments*. Encoding and inserting a batch at a time
    avoids one model call and one database call per fragment. An empty
    *fragments* list results in no calls at all.

    Args:
        prefix: Id prefix identifying the source of the fragments.
        fragments: List of text fragments to index.
        batch_size: Maximum number of fragments per encode/add call.
    """
    for start in range(0, len(fragments), batch_size):
        batch = fragments[start:start + batch_size]
        # encode() on a list returns one embedding row per input text.
        vectors = model.encode(batch).tolist()
        collection.add(
            documents=batch,
            ids=[f"{prefix}{start + offset}" for offset in range(len(batch))],
            embeddings=vectors,
        )


# Ontology fragments and dataset fragments share the collection; the id
# prefix is what distinguishes them at retrieval time.
_index_fragments("ttl_block_", ttl_fragments)
_index_fragments("nt_block_", nt_fragments)

print("Indexación completa ✅")

## Query y recuperación de fragmentos

In [None]:
# Free-text query; it is embedded with the same model used for indexing.
query = "gene-enhancer associations with anatomical context"
query_embedding = model.encode([query])[0].tolist()

# Request five results for the single query embedding.
results = collection.query(query_embeddings=[query_embedding], n_results=5)

# Results are nested per query; only one query was sent, so take index 0.
hit_ids = results['ids'][0]
hit_docs = results['documents'][0]
for hit_id, hit_doc in zip(hit_ids, hit_docs):
    print(f"\nID: {hit_id}\n---\n{hit_doc}\n")

## Generación Prompt automática

In [None]:
def _first_by_prefix(ids, documents, prefix):
    """Return the first document whose id starts with *prefix*, or None."""
    return next(
        (doc for doc_id, doc in zip(ids, documents) if doc_id.startswith(prefix)),
        None,
    )


def _pick_fragment(prefix):
    """Return the first retrieved fragment whose id starts with *prefix*.

    The fragment is taken from ``results`` when possible. If none of those
    results has the requested prefix, the same query embedding is re-run
    over the whole collection so that the prompt never silently contains
    the text "None".

    Raises:
        ValueError: If the collection holds no fragment with that prefix.
    """
    fragment = _first_by_prefix(results['ids'][0], results['documents'][0], prefix)
    if fragment is None:
        total = collection.count()
        if total:
            wider = collection.query(
                query_embeddings=[query_embedding],
                n_results=total,
            )
            fragment = _first_by_prefix(wider['ids'][0], wider['documents'][0], prefix)
    if fragment is None:
        raise ValueError(
            f"No indexed fragment with id prefix {prefix!r}; cannot build the prompt."
        )
    return fragment


# One ontology fragment and one dataset fragment are injected as context.
ttl_fragment = _pick_fragment("ttl_block_")
nt_fragment = _pick_fragment("nt_block_")

prompt = f"""
You are an expert in semantic knowledge modeling and ontology engineering.

Ontology fragment:
{ttl_fragment}

Dataset fragment:
{nt_fragment}

Task:
- For each subject in the dataset, propose:
    - The correct class (from ontology or new if needed)
    - The mapping of each property to an ontology property
    - If no property matches, propose a new one with domain/range.

Return your output as a valid Turtle RDF block.
"""

print(prompt)

## Llamada al LLM (Mistral)

In [None]:
# Load the local LLM as a text-generation pipeline.
pipe = pipeline(
    "text-generation",
    model=model_path_llm,
    torch_dtype="auto",
    device_map="auto"
)

# return_full_text=False: by default the pipeline returns the prompt followed
# by the completion. The output is later written to a .ttl file and parsed as
# Turtle, so the echoed prompt text would make that parse fail every time.
# NOTE(review): the prompt is passed as plain text; confirm whether the
# model's instruct/chat template should be applied.
resultado = pipe(
    prompt,
    max_new_tokens=2048,
    return_full_text=False,
)

# Only the newly generated text of the first (and only) returned sequence.
respuesta_llm = resultado[0]["generated_text"]
print(respuesta_llm)

## Guardar TTL generado

In [None]:
# Destination of the generated ontology for this query.
output_path = "./ontologias_experimentos_RAG/ontologia_generada_Q1.ttl"

# The file is parsed as Turtle afterwards, which is a UTF-8 format, so the
# encoding is set explicitly instead of relying on the platform default.
with open(output_path, "w", encoding="utf-8") as f:
    f.write(respuesta_llm)

print(f"Ontología generada guardada en: {output_path}")

## Evaluación automática

In [None]:
generated_path = "./ontologias_experimentos_RAG/ontologia_generada_Q1.ttl"

_RDF_TYPE = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
_OWL_CLASS = URIRef("http://www.w3.org/2002/07/owl#Class")


def _coverage_pct(found, reference):
    """Return the percentage of *reference* items also present in *found*.

    Returns 0.0 when *reference* is empty instead of dividing by zero.
    """
    if not reference:
        return 0.0
    return len(found & reference) / len(reference) * 100


# Parse the reference ontology and the generated one. A syntax error in the
# generated file propagates from parse(), i.e. the output is not valid Turtle.
official_graph = Graph()
official_graph.parse(ttl_path, format="turtle")

generated_graph = Graph()
generated_graph.parse(generated_path, format="turtle")

# Classes: subjects declared with `rdf:type owl:Class`. The predicate is
# restricted to rdf:type so that triples which merely point at owl:Class
# through another predicate are not counted as class declarations.
official_classes = set(official_graph.subjects(predicate=_RDF_TYPE, object=_OWL_CLASS))
generated_classes = set(generated_graph.subjects(predicate=_RDF_TYPE, object=_OWL_CLASS))

# NOTE(review): predicates() collects every predicate *used* in a triple
# (including rdf:type and similar), not the properties *declared* by the
# ontology. Confirm this is the intended definition of property coverage.
official_properties = set(official_graph.predicates())
generated_properties = set(generated_graph.predicates())

coverage_classes = _coverage_pct(generated_classes, official_classes)
coverage_properties = _coverage_pct(generated_properties, official_properties)

new_properties = generated_properties - official_properties
missing_properties = official_properties - generated_properties

print(f"Coverage classes: {coverage_classes:.1f}%")
print(f"Coverage properties: {coverage_properties:.1f}%")
print(f"Nuevas propiedades: {len(new_properties)}")
print(f"Propiedades faltantes: {len(missing_properties)}")