In [None]:
# %% [Full installation]
# System packages installed with apt: poppler-utils and Tesseract OCR with its Spanish data.
!apt-get update
!apt-get install -y poppler-utils tesseract-ocr tesseract-ocr-spa

# Python packages used across the notebook.
!pip install --upgrade pip
!pip install pykeen pdf2image pytesseract Pillow nltk networkx matplotlib transformers torch huggingface-hub  pymupdf unidecode spacy owlready2
!pip install langdetect
# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')  # Useful for additional analysis

# Spanish spaCy model; a later cell loads it with spacy.load("es_core_news_lg").
!python -m spacy download es_core_news_lg

In [None]:
# Client library for the later cell that runs `from mistralai import Mistral`.
!pip install mistralai

# Procesamiento del Texto

In [None]:
# Domain stop words: generic scientific-paper vocabulary that should never become a
# graph node. Kept as a mutable set because a later cell extends it in place.
STOPWORDS = {
    "introducci√≥n", "m√©todo", "m√©todos", "resultado", "resultados",
    "discusi√≥n", "conclusi√≥n", "figura", "tabla", "referencia",
    "estudio", "an√°lisis", "datos", "art√≠culo", "secci√≥n",
    "mostrado", "usando", "usado", "basado", "encontrado",
    "tambi√©n", "sin embargo", "aunque", "a√±o", "a√±os",
    "tiempo", "alto", "bajo", "valor", "caso",
    "grupo", "et", "al", "probabilidad", "momento",
    "situaciones", "descubrir", "mantiene", "significaba", "quiz√°s",
    "debido", "uso", "hacer", "obtener", "puede",
    "podr√≠a", "listado", "conferencias", "antecedentes", "significancia",
    "derechos de autor", "autor", "fig", "ec", "vol",
}

# Layout / citation artefacts plus a handful of generic words, checked separately
# from STOPWORDS when filtering candidate entities.
BLACKLIST = set(
    [
        # Figure / table captions
        "fig", "figura", "figure", "tabla", "table", "cuadro",
        # Identifiers and links
        "doi", "issn", "url", "http", "www",
        # Citation fragments
        "et", "al", "vol", "no", "p√°g", "pag", "ed",
        # Generic words
        "estudio", "an√°lisis", "datos", "m√©todo", "resultado", "conclusi√≥n",
    ]
)

In [None]:
from idna import decode
import pdf2image
import fitz
import pytesseract
from unidecode import unidecode
import nltk
import hashlib
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations
from pathlib import Path
import string
import spacy
import os
import re

# Load the Spanish spaCy pipeline that later cells use for lemmas and dependency parses.
try:
    nlp = spacy.load("es_core_news_lg")
    print("‚úÖ Modelo spaCy 'es_core_news_lg' cargado correctamente.")
except OSError:
    # NOTE(review): `nlp` stays undefined on this path, so every later cell that calls
    # it fails with NameError; the message below is the only warning the user gets.
    print("‚ö†Ô∏è Error cargando spaCy. Aseg√∫rate de haber ejecutado la celda de instalaci√≥n.")


nltk.download("punkt")
nltk.download("stopwords")


ruta_local = os.path.expanduser("./model")
# Folder configuration
carpeta_txts = "./txts"  # Cache of extracted text, one .txt per PDF
carpeta_pdfs = "./corpus"  # Folder at the project root
carpeta_pdfs = Path(carpeta_pdfs)
article_txt = ""
all_articles_text = []

# Make sure the corpus folder exists
if not carpeta_pdfs.exists():
    carpeta_pdfs.mkdir(parents=True, exist_ok=True)
    print(f"Carpeta '{carpeta_pdfs}' creada. Coloca tus PDFs all√≠.")
else:
    # Sorted so the corpus (and everything derived from it) is built in the same
    # order on every run; Path.glob order depends on the filesystem.
    for pdf_file in sorted(carpeta_pdfs.glob("*.pdf")):
        print(f"\nProcesando: {pdf_file.name}")

        article_txt = ""  # Reset per PDF so a failure never reuses the previous text

        try:
            # Cached text lives next to the other .txt files, named after the PDF
            pdf_file_name = pdf_file.with_suffix(".txt").name
            file_name = Path(carpeta_txts) / pdf_file_name

            print(file_name)

            # Reuse the cached .txt when it exists
            if file_name.exists():
                print(f"  El archivo {file_name.name} ya existe. Cargando texto existente...")
                with open(file_name, "r", encoding="utf-8") as f:
                    article_txt = f.read()
            else:
                # --- Fresh extraction from the PDF ---
                full_text_pages = []

                # Context manager closes the document even if a page fails to parse
                with fitz.open(pdf_file) as doc:
                    for page in doc:
                        blocks = page.get_text("blocks", sort=True)
                        page_text_parts = []
                        for block in blocks:
                            # Block tuples end with (block_no, block_type); type 0 is
                            # text, anything else carries an image placeholder string.
                            if block[6] != 0:
                                continue
                            txt = block[4]
                            if not txt.strip() or len(txt) < 5:
                                continue
                            # Re-insert the space lost when a lowercase letter is glued
                            # to the following uppercase letter
                            txt = re.sub(r'([a-z√°√©√≠√≥√∫√±])([A-Z√Å√â√ç√ì√ö√ë])', r'\1 \2', txt)
                            page_text_parts.append(txt)

                        full_text_pages.append("\n".join(page_text_parts))

                article_txt = "\n".join(full_text_pages)
                # NOTE(review): this collapses every newline, so the newline-based rules
                # in clean_text (reference cut-off, hyphen re-joining, page-number lines)
                # cannot fire on freshly extracted text. Kept to stay compatible with
                # the .txt files already cached in this format.
                article_txt = re.sub(r'\s+', ' ', article_txt).strip()

                # Persist the extracted text
                file_name.parent.mkdir(parents=True, exist_ok=True)
                with open(file_name, "w", encoding="utf-8") as f:
                    f.write(article_txt)

                print(f"  Texto extra√≠do y guardado en {file_name.name}")

            # Collect only non-empty articles
            if article_txt.strip():
                print(article_txt)
                all_articles_text.append(article_txt)

        except Exception as e:
            print(f"  Error procesando {pdf_file.name}: {e}")

# Join every article into a single corpus string
combined_text = "\n\n".join(all_articles_text)

In [None]:
from langdetect import detect, LangDetectException

def is_spanish(text):
    """Return True when langdetect classifies ``text`` as Spanish.

    Args:
        text: Sentence or paragraph to classify.

    Returns:
        False for inputs with fewer than four whitespace-separated tokens, and
        False when langdetect raises ``LangDetectException``; otherwise whether
        the detected language code is ``'es'``.
    """
    # langdetect fails or is imprecise on very short inputs
    if len(text.split()) < 4:
        return False

    # langdetect is non-deterministic by default; pinning the factory seed makes the
    # same sentence get the same verdict on every run.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    try:
        return detect(text) == 'es'
    except LangDetectException:
        return False

def clean_text(text):
    """Normalise extracted article text into a single whitespace-collapsed string.

    Steps, in order:
      1. Keep only what precedes the first line that starts with a references /
         bibliography heading (case-insensitive).
      2. Re-join words split by a hyphen at a line break.
      3. Drop lines that are shorter than three characters or purely numeric
         (typical pagination residue).
      4. Collapse every run of whitespace into one space.

    Args:
        text: Raw text, possibly containing newlines.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    references_heading = r'\n\s*(Referencia|Bibliograf|Bibliography|References)'
    body = re.split(references_heading, text, flags=re.IGNORECASE)[0]
    body = re.sub(r'(\w)-\n(\w)', r'\1\2', body)

    kept_lines = [
        stripped
        for stripped in (raw_line.strip() for raw_line in body.split('\n'))
        if len(stripped) >= 3 and not stripped.isdigit()
    ]

    return re.sub(r'\s+', ' ', " ".join(kept_lines)).strip()

print(combined_text)
# Clean each article on its own. clean_text discards everything after the first
# references heading it sees, so running it on the concatenated corpus would drop
# every article that follows the first such heading.
ctext = " ".join(
    cleaned_article
    for cleaned_article in (clean_text(article) for article in all_articles_text)
    if cleaned_article
)
sentences = nltk.tokenize.sent_tokenize(ctext, language="spanish")
print(sentences)
print(ctext)

# NER

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Hugging Face token-classification pipeline for Spanish biomedical NER
# (used for entity detection in place of spaCy's own NER).
model_name = "HUMADEX/spanish_medical_ner"
try:
    # Load tokenizer and weights explicitly so the pipeline uses exactly this pair
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    ner_pipeline = pipeline(
        task="ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="max",
    )
except Exception as load_error:
    # Report the failure, then re-raise so the notebook stops here
    print(f"Error cargando modelo: {load_error}")
    raise

In [None]:
ENGLISH_STOPWORDS = {"the", "of", "and", "in", "to", "with", "a", "for", "study", "fistulae", "skull", "base", "flap", "nasoseptal"}
STOPWORDS.update(ENGLISH_STOPWORDS)

# Thresholds used by the extraction loop below
MIN_SENTENCE_CHARS = 20   # shorter sentences are skipped
MIN_NER_SCORE = 0.60      # minimum pipeline confidence for an entity
MIN_LEMMA_CHARS = 4       # shorter lemmas are treated as noise

# Lemmas are accent-stripped before filtering, so the filter words must be
# accent-stripped too; otherwise an accented stop word can never match its lemma.
NORMALIZED_FILTER_WORDS = {unidecode(word).lower() for word in STOPWORDS | BLACKLIST}

entity_list = []

# Split the cleaned corpus into sentences
sentences = nltk.tokenize.sent_tokenize(ctext, language="spanish")

for s in sentences:
    # Cheap length check first, so language detection only runs on real candidates
    if len(s) < MIN_SENTENCE_CHARS:
        continue

    # Language filter: skip sentences that are not Spanish
    if not is_spanish(s):
        continue

    # Run NER and the spaCy parse on the same sentence so character offsets line up
    results = ner_pipeline(s)
    doc = nlp(s)

    denotations = []
    seen_lemmas = set()  # De-duplicates nodes within one sentence

    for ent in results:
        if ent["score"] < MIN_NER_SCORE:
            continue

        # Map the entity's character offsets onto spaCy tokens for lemmatisation
        span = doc.char_span(ent["start"], ent["end"], alignment_mode="contract")

        if not span:
            # spaCy could not align the offsets: fall back to the raw entity text
            lemma = ent["word"].strip().lower()
        else:
            # Keep token order: "mucosa nasal" and "nasal mucosa" are different nodes
            lemma = " ".join(t.lemma_.lower() for t in span)

        # Strip accents on BOTH paths so the same concept always yields the same node name
        lemma = unidecode(lemma)

        # Remove punctuation
        lemma = re.sub(r'[^\w\s]', '', lemma).strip()

        # Node quality filters
        if (len(lemma) < MIN_LEMMA_CHARS or
            lemma in NORMALIZED_FILTER_WORDS or
            lemma.isdigit() or
            any(w in ENGLISH_STOPWORDS for w in lemma.split())):  # contains English words
            continue

        label = ent["entity_group"]

        # Add each lemma at most once per sentence
        if lemma not in seen_lemmas:
            denotations.append({
                "obj": label,
                "span": {"begin": ent["start"], "end": ent["end"]},
                "lemma": lemma,
                "original": ent["word"]
            })
            seen_lemmas.add(lemma)

    if denotations:
        entity_list.append({"text": s, "denotations": denotations})

print(entity_list)

In [None]:
# Convert the per-sentence NER denotations into the records consumed by the graph
# builders: {"entities": [...], "text": ..., "text_sha256": ...}.
parsed_entities = []
for entities_in_sentence in entity_list:
    sentence_text = entities_in_sentence["text"]

    # Every record carries an "entities" list, even when it is empty, so consumers
    # can index it without first checking for the key.
    record = {
        "entities": [],
        "text": sentence_text,
        "text_sha256": hashlib.sha256(sentence_text.encode("utf-8")).hexdigest(),
    }

    for entity_denotation in entities_in_sentence.get("denotations") or []:
        # The normalised lemma is both the node identifier and the node label;
        # this pipeline produces no alternative identifiers.
        entity_id = entity_denotation["lemma"]
        other_ids = []
        entity_type = entity_denotation["obj"]
        processed_lemma_for_graph = entity_denotation["lemma"]

        # Final sanity filters on the lemma
        if processed_lemma_for_graph in STOPWORDS or len(processed_lemma_for_graph) < 3 or len(processed_lemma_for_graph) > 60 or processed_lemma_for_graph.isdigit() or not re.search(r'[a-z√°√©√≠√≥√∫√±]', processed_lemma_for_graph, re.I):
            continue

        record["entities"].append(
            {
                "entity_id": entity_id,
                "other_ids": other_ids,
                "entity_type": entity_type,
                "entity": processed_lemma_for_graph,
                "start": entity_denotation["span"]["begin"],  # character offset in the sentence
                "end": entity_denotation["span"]["end"],      # character offset in the sentence
            }
        )

    parsed_entities.append(record)


# Construcción del Grafo de Conocimiento

In [None]:
def check_dependency(token1, token2):
    """Tell whether two tokens are closely linked in the dependency parse.

    Two tokens count as linked when either is an ancestor of the other, or when
    they share the same immediate head and that head is tagged VERB or AUX
    (for example a subject and an object of the same verb).

    Args:
        token1: First token; must expose ``ancestors``, ``head`` and, on its
            head, ``pos_``.
        token2: Second token, with the same attributes.

    Returns:
        True when one of the two conditions holds, False otherwise.
    """
    linked_by_ancestry = token1 in token2.ancestors or token2 in token1.ancestors
    if linked_by_ancestry:
        return True

    shared_head = token1.head
    return shared_head == token2.head and shared_head.pos_ in ["VERB", "AUX"]

# Requires nlp, check_dependency and parsed_entities from the cells above.

G = nx.Graph()

print("üï∏Ô∏è Construyendo grafo basado en sintaxis...")

for item in parsed_entities:
    sentence_text = item["text"]
    # Records for sentences without entities may lack the key entirely
    entities = item.get("entities", [])

    # A pair needs at least two entities; skip before paying for the spaCy parse
    if len(entities) < 2:
        continue

    # Parse the sentence with spaCy
    doc = nlp(sentence_text)

    spacy_entities = []

    # 1. Align the NER entities (character offsets) with spaCy tokens
    for ent in entities:
        # alignment_mode="contract" shrinks the span to whole tokens instead of
        # cutting a word in half; it yields None when nothing fits
        span = doc.char_span(ent["start"], ent["end"], alignment_mode="contract")

        if span:
            spacy_entities.append({
                "span": span,                # spaCy Span object
                "lemma": ent["entity_id"],   # graph node name
                "type": ent["entity_type"]
            })

    # 2. Evaluate every possible pair
    if len(spacy_entities) < 2:
        continue

    pairs = combinations(spacy_entities, 2)

    for e1, e2 in pairs:
        # Syntactic root of each entity span
        root1 = e1["span"].root
        root2 = e2["span"].root

        # Edge rule: syntactic dependency (weight 2) or close co-occurrence (weight 1)
        add_edge = False
        weight_increment = 1

        if check_dependency(root1, root2):
            add_edge = True
            weight_increment = 2  # Syntactic links count double
        elif abs(root1.i - root2.i) < 5:  # Roots fewer than 5 tokens apart
            add_edge = True

        if add_edge:
            source = e1["lemma"]
            target = e2["lemma"]

            # Accumulate weight on repeated pairs
            if G.has_edge(source, target):
                G[source][target]["weight"] += weight_increment
            else:
                G.add_edge(source, target, weight=weight_increment)

print(f"‚úÖ Grafo construido: {G.number_of_nodes()} nodos y {G.number_of_edges()} conexiones.")


print(f"Grafo original: {G.number_of_nodes()} nodos.")

# Filtering and plotting parameters
min_degree = 1
min_betweenness = 0.01
layout_seed = 42  # Fixes the spring layout so the saved figure is reproducible

# 1. Remove noise: nodes below the minimum degree ...
nodes_to_remove_degree = [node for node, degree in G.degree() if degree < min_degree]

# ... and nodes whose betweenness centrality is below the threshold
betweenness = nx.betweenness_centrality(G)
nodes_to_remove_betweenness = [node for node, score in betweenness.items() if score < min_betweenness]

nodes_to_remove = set(nodes_to_remove_degree).union(nodes_to_remove_betweenness)
G.remove_nodes_from(nodes_to_remove)
# NOTE(review): degrees are measured before the removal, so surviving nodes can end up
# with fewer than min_degree neighbours; the message below describes the filter that
# was applied, not a guarantee about the resulting graph.

print(f"Grafo filtrado (Grado >= {min_degree} y Betweenness >= {min_betweenness}): {G.number_of_nodes()} nodos.")

# 2. Save GEXF for Gephi
nx.write_gexf(G, "knowledge_graph_syntax.gexf")
print("üíæ Guardado archivo 'knowledge_graph_syntax.gexf'.")

# 3. Save the node list, one node per line
with open("nodos.txt", "w", encoding="utf-8") as f:
    for node in G.nodes():
        f.write(str(node) + "\n")

# 4. Quick visualisation
if G.number_of_nodes() > 0:
    plt.figure(figsize=(16, 12))
    # Spring layout, seeded so repeated runs draw the same picture
    pos = nx.spring_layout(G, k=0.15, iterations=20, seed=layout_seed)

    # Node size proportional to degree
    d = dict(G.degree)
    node_sizes = [v * 20 for v in d.values()]

    nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color="#6495ED", alpha=0.7)
    nx.draw_networkx_edges(G, pos, alpha=0.2)
    # Every remaining node is labelled
    labels = {n: n for n in G.nodes()}
    nx.draw_networkx_labels(G, pos, labels=labels, font_size=8, font_color="black")

    plt.title("Grafo de Conocimiento (Filtrado por Sintaxis)")
    plt.axis("off")
    plt.savefig("grafo_sintactico.png", dpi=300, bbox_inches="tight")
    plt.show()
else:
    print("‚ö†Ô∏è El grafo qued√≥ vac√≠o despu√©s del filtrado. Intenta bajar la exigencia del score NER o el grado m√≠nimo.")

In [None]:
# Write the current node list to nodos.txt, one node per line.
with open("nodos.txt", "w", encoding="utf-8") as nodes_file:
    nodes_file.writelines(str(node) + "\n" for node in G.nodes())

# Validación del grafo

In [None]:
import os
import requests
import networkx as nx
from difflib import get_close_matches
from owlready2 import get_ontology, World


# ==========================================
# CLASE 1: OntologyLoader
# ==========================================
class OntologyLoader:
    """Download ontologies into a local cache and load them with owlready2.

    Each loader owns its own owlready2 ``World``; ontologies loaded through it
    live in that world.
    """

    def __init__(self, cache_dir="./cache_ontologies"):
        """Create the cache directory if needed and an empty owlready2 world.

        Args:
            cache_dir: Directory where downloaded ontology files are stored.
        """
        self.cache_dir = cache_dir
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(self.cache_dir, exist_ok=True)
        self.world = World()

    def load_from_url(self, url, filename=None):
        """Download (if not cached) and load an ontology.

        Args:
            url: Location of the ontology file.
            filename: Name for the cached file. Defaults to the last path segment
                of ``url``, with ``.owl`` appended unless it already ends in
                ``.owl`` or ``.ttl``.

        Returns:
            The loaded ontology, or None when the download or the load fails
            (the error is printed, not raised).
        """
        if not filename:
            filename = url.split('/')[-1]
            if not filename.endswith('.owl') and not filename.endswith('.ttl'):
                filename += '.owl'

        local_path = os.path.join(self.cache_dir, filename)

        if not os.path.exists(local_path):
            print(f"‚¨áÔ∏è Descargando ontolog√≠a desde {url}...")
            # Download to a side file and rename at the end: an interrupted download
            # must never leave a truncated file that later runs treat as a valid cache.
            partial_path = local_path + ".part"
            try:
                with requests.get(url, timeout=60, stream=True) as response:
                    response.raise_for_status()
                    with open(partial_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                os.replace(partial_path, local_path)
                print(f"‚úÖ Guardado en {local_path}")
            except Exception as e:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
                print(f"‚ùå Error descargando ontolog√≠a: {e}")
                return None
        else:
            print(f"üìÇ Cargando ontolog√≠a desde cach√©: {local_path}")

        try:
            onto = self.world.get_ontology(local_path).load()
            return onto
        except Exception as e:
            print(f"‚ö†Ô∏è Error cargando ontolog√≠a con owlready2: {e}")
            return None

    def get_term_labels(self, ontology):
        """Collect lower-cased class names and labels for term matching.

        Args:
            ontology: Object exposing ``classes()``; a falsy value yields an
                empty set.

        Returns:
            Set of lower-cased strings: every class ``name`` plus every entry of
            its ``label`` attribute when present.
        """
        labels = set()
        if not ontology:
            return labels

        for c in ontology.classes():
            labels.add(c.name.lower())
            if hasattr(c, 'label'):
                labels.update(str(l).lower() for l in c.label)
        return labels

    def get_relationships(self, ontology):
        """Collect lower-cased object-property names and labels.

        Args:
            ontology: Object exposing ``object_properties()``; a falsy value
                yields an empty set.

        Returns:
            Set of lower-cased strings: every property ``name`` plus every entry
            of its ``label`` attribute when present.
        """
        relationships = set()
        if not ontology:
            return relationships

        for prop in ontology.object_properties():
            relationships.add(prop.name.lower())
            if hasattr(prop, 'label'):
                relationships.update(str(l).lower() for l in prop.label)
        return relationships

# Display name -> download URL for each ontology the benchmark cell evaluates.
# NOTE(review): "DoCS" and "DOID (Enfermedades - Espa√±ol)" point at the same URL, so
# that ontology is evaluated twice under two names — confirm the intended "DoCS" URL.
ontologies_map = {
    "HPO (Phenotypes)": "http://purl.obolibrary.org/obo/hp.owl",
    "DoCS": "http://purl.obolibrary.org/obo/doid/translations/doid-es.owl",
    "NCIT (Cancer/Bio)": "http://purl.obolibrary.org/obo/ncit.owl",
    "HPO (Espa√±ol)": "http://purl.obolibrary.org/obo/hp/hp-international.owl",
    "DOID (Enfermedades - Espa√±ol)": "http://purl.obolibrary.org/obo/doid/translations/doid-es.owl"
}

In [None]:
class GraphValidator:
    """Validate graph nodes and edge types against terms indexed from ontologies."""

    def __init__(self, ontology_urls=None):
        """Load each ontology and index its terms and relationships.

        Args:
            ontology_urls: Optional iterable of ontology URLs. URLs that fail to
                load are skipped.
        """
        self.loader = OntologyLoader()
        self.known_terms = set()
        self.known_relationships = set()
        self.ontologies = []
        self.ontology_labels = {}  # term -> set of ontology names that contain it

        if ontology_urls:
            for url in ontology_urls:
                onto = self.loader.load_from_url(url)
                if onto:
                    self.ontologies.append(onto)
                    print("‚öôÔ∏è Indexando t√©rminos y relaciones...", end="\r")
                    onto_labels = self.loader.get_term_labels(onto)
                    self.known_terms.update(onto_labels)
                    self.known_relationships.update(self.loader.get_relationships(onto))

                    # Ontology name = URL file name without its extension
                    onto_name = url.split('/')[-1].split('.')[0]
                    for term in onto_labels:
                        self.ontology_labels.setdefault(term, set()).add(onto_name)

                    print(f"‚úÖ {len(self.known_terms)} t√©rminos, {len(self.known_relationships)} relaciones indexadas.")

    def _closest_match(self, needle, pool_attr):
        """Return the closest fuzzy match for ``needle`` in ``self.<pool_attr>``, or None.

        The candidate list and the per-needle results are cached, because the same
        node is looked up once per incident edge and a fuzzy scan of a large
        ontology is expensive. The cache is rebuilt whenever the pool size changes.
        """
        pool = getattr(self, pool_attr)
        caches = self.__dict__.setdefault("_fuzzy_caches", {})
        entry = caches.get(pool_attr)
        if entry is None or entry["size"] != len(pool):
            entry = {"size": len(pool), "candidates": list(pool), "hits": {}}
            caches[pool_attr] = entry

        if needle not in entry["hits"]:
            found = get_close_matches(needle, entry["candidates"], n=1, cutoff=0.85)
            entry["hits"][needle] = found[0] if found else None
        return entry["hits"][needle]

    def validate_term(self, term):
        """Validate a term by exact or fuzzy match.

        Returns:
            Dict with ``status`` ("valid"/"invalid"), ``match`` (matched term or
            None) and ``type`` ("exact", "fuzzy" or "none").
        """
        detailed = self.validate_term_with_ontology(term)
        return {
            "status": detailed["status"],
            "match": detailed["match"],
            "type": detailed["type"],
        }

    def validate_term_with_ontology(self, term):
        """Validate a term and report which ontologies contain the match.

        Returns:
            Same keys as ``validate_term`` plus ``ontologies``, a list of
            ontology names (empty when nothing matched).
        """
        term_lower = term.lower()

        if term_lower in self.known_terms:
            return {
                "status": "valid",
                "match": term_lower,
                "type": "exact",
                "ontologies": list(self.ontology_labels.get(term_lower, set())),
            }

        matched_term = self._closest_match(term_lower, "known_terms")
        if matched_term is not None:
            return {
                "status": "valid",
                "match": matched_term,
                "type": "fuzzy",
                "ontologies": list(self.ontology_labels.get(matched_term, set())),
            }

        return {
            "status": "invalid",
            "match": None,
            "type": "none",
            "ontologies": [],
        }

    def validate_edge(self, source, target, edge_type):
        """Validate an edge: both endpoints and the relation type.

        Returns:
            Dict with the ``source`` and ``target`` term validations, an
            ``edge_type`` dict (``status``, ``match``, ``original``) and
            ``fully_valid``, which is True only when all three are valid.
        """
        source_validation = self.validate_term(str(source))
        target_validation = self.validate_term(str(target))

        edge_type_lower = str(edge_type).lower()
        if edge_type_lower in self.known_relationships:
            edge_match = edge_type_lower
        else:
            edge_match = self._closest_match(edge_type_lower, "known_relationships")
        edge_valid = edge_match is not None

        return {
            "source": source_validation,
            "target": target_validation,
            "edge_type": {
                "status": "valid" if edge_valid else "invalid",
                "match": edge_match,
                "original": edge_type
            },
            "fully_valid": (
                source_validation["status"] == "valid" and
                target_validation["status"] == "valid" and
                edge_valid
            )
        }

    def validate_graph(self, graph):
        """Validate every node and edge of a NetworkX-style graph.

        Args:
            graph: Object exposing ``number_of_nodes()``, ``number_of_edges()``,
                ``nodes()`` and ``edges(data=True)``.

        Returns:
            Report dict with node counters and ``node_details``, the nested
            ``edge_report`` (``valid_rels``, ``weak_rels``, ``details``),
            ``ontology_coverage``, ``node_precision`` and ``edge_precision``.
            The flat keys ``valid_edges``, ``invalid_edges`` and ``edge_details``
            mirror the ``edge_report`` values for callers that read the flat layout.
        """
        report = {
            "total_nodes": graph.number_of_nodes(),
            "valid_nodes": 0,
            "invalid_nodes": 0,
            "node_details": {},
            "edge_report": {
                "total_edges": graph.number_of_edges(),
                "valid_rels": 0,
                "weak_rels": 0,
                "avg_distance": 0.0,  # never updated in this method
                "details": {}
            },
            "ontology_coverage": {
                "total_unique_terms_matched": 0,
                "terms_by_ontology_count": {},
                "term_ontology_distribution": {}
            }
        }
        edge_report = report["edge_report"]

        # Accumulators for the coverage metrics
        all_matched_terms = set()
        ontology_term_counts = {}
        term_ontology_map = {}  # matched term -> set of ontology names

        print(f"\nüîç Validando {graph.number_of_nodes()} nodos...")
        for node in graph.nodes():
            result = self.validate_term_with_ontology(str(node))
            report["node_details"][node] = result

            if result["status"] == "valid":
                report["valid_nodes"] += 1
                all_matched_terms.add(result["match"])

                # Count matches per ontology
                for ontology in result["ontologies"]:
                    ontology_term_counts[ontology] = ontology_term_counts.get(ontology, 0) + 1

                # Map the matched term to its ontologies
                term_ontology_map.setdefault(result["match"], set()).update(result["ontologies"])
            else:
                report["invalid_nodes"] += 1

        report["ontology_coverage"]["total_unique_terms_matched"] = len(all_matched_terms)
        report["ontology_coverage"]["terms_by_ontology_count"] = ontology_term_counts
        report["ontology_coverage"]["term_ontology_distribution"] = {
            term: list(ontologies)
            for term, ontologies in term_ontology_map.items()
        }

        print(f"üîó Validando {graph.number_of_edges()} aristas...")
        for i, (source, target, data) in enumerate(graph.edges(data=True)):
            # Prefer the 'relation' attribute, fall back to 'type', then 'unknown'
            edge_type = data.get('relation', data.get('type', 'unknown'))
            result = self.validate_edge(source, target, edge_type)
            edge_report["details"][(source, target, edge_type)] = result

            if result["fully_valid"]:
                edge_report["valid_rels"] += 1
            else:
                edge_report["weak_rels"] += 1

            if (i + 1) % 100 == 0:
                print(f"   Procesadas {i + 1}/{report['edge_report']['total_edges']} aristas...", end="\r")

        print()  # Finish the carriage-return progress line

        if report["total_nodes"] > 0:
            report["node_precision"] = report["valid_nodes"] / report["total_nodes"]
        else:
            report["node_precision"] = 0.0

        if edge_report["total_edges"] > 0:
            report["edge_precision"] = edge_report["valid_rels"] / edge_report["total_edges"]
        else:
            report["edge_precision"] = 0.0

        # Flat aliases of the nested edge counters
        report["valid_edges"] = edge_report["valid_rels"]
        report["invalid_edges"] = edge_report["weak_rels"]
        report["edge_details"] = edge_report["details"]

        return report

In [None]:
# Benchmark the graph against each ontology in ontologies_map and print coverage statistics.
if 'G' not in locals():
    print("‚ö†Ô∏è ¬°Alerta! La variable 'G' no est√° definida.")
else:
    print(f"üöÄ Iniciando evaluaci√≥n del grafo ({G.number_of_nodes()} nodos, {G.number_of_edges()} aristas)...\n")
    print(f"{'Ontolog√≠a':<25} | {'Nodos V/I':<12} | {'Aristas V/I':<13} | {'Prec. Nodos':<12} | {'Prec. Aristas':<14}")
    print("-" * 95)

    results_summary = []

    # Global accumulators
    all_terms_in_graph = set(G.nodes())
    all_matched_terms_global = set()   # ontology terms that matched some node
    matched_nodes_global = set()       # graph nodes validated by at least one ontology
    term_ontology_count_global = {}    # matched term -> accumulated ontology count
    ontology_overlap_stats = {}        # ontology name -> coverage summary

    for name, url in ontologies_map.items():
        try:
            validator = GraphValidator(ontology_urls=[url])
            report = validator.validate_graph(G)

            # Edge counters live in the nested "edge_report" section of the report
            edge_report = report['edge_report']

            nodes_str = f"{report['valid_nodes']}/{report['invalid_nodes']}"
            edges_str = f"{edge_report['valid_rels']}/{edge_report['weak_rels']}"
            node_prec = report['node_precision']
            edge_prec = report['edge_precision']

            print(f"{name:<25} | {nodes_str:<12} | {edges_str:<13} | {node_prec:>10.2%} | {edge_prec:>12.2%}")

            ontology_overlap_stats[name] = {
                'unique_terms_matched': report['ontology_coverage']['total_unique_terms_matched'],
                'details': report['ontology_coverage']
            }

            # Accumulate matched ontology terms
            for term, ontologies in report['ontology_coverage']['term_ontology_distribution'].items():
                all_matched_terms_global.add(term)
                term_ontology_count_global[term] = term_ontology_count_global.get(term, 0) + len(ontologies)

            # Track the graph nodes themselves: a fuzzy match has a different spelling
            # from its node, so matched terms cannot stand in for matched nodes.
            matched_nodes_global.update(
                node for node, node_result in report['node_details'].items()
                if node_result['status'] == 'valid'
            )

            results_summary.append({
                "ontology": name,
                "node_precision": node_prec,
                "edge_precision": edge_prec,
                "details": report
            })

        except Exception as e:
            print(f"{name:<25} | ‚ùå ERROR: {str(e)}")

    print("-" * 95)
    print("\n‚úÖ Benchmark Completado.")

    print("\nüìä ESTAD√çSTICAS DE COBERTURA GLOBAL")
    print("=" * 60)

    # 1. Unique terms matched overall
    print(f"\nüîπ T√©rminos √∫nicos que coincidieron en AL MENOS una ontolog√≠a: {len(all_matched_terms_global)}")

    # 2. Graph nodes that no ontology validated
    terms_not_found = all_terms_in_graph - matched_nodes_global
    print(f"üîπ T√©rminos NO encontrados en NINGUNA ontolog√≠a: {len(terms_not_found)}")

    if terms_not_found:
        print("   Ejemplos (primeros 10):")
        for i, term in enumerate(list(terms_not_found)[:10]):
            print(f"   {i+1}. {term}")

    # 3. Top 10 terms by accumulated ontology count
    print(f"\nüèÜ TOP 10 t√©rminos con m√°s apariciones en ontolog√≠as:")
    sorted_terms = sorted(term_ontology_count_global.items(), key=lambda x: x[1], reverse=True)[:10]
    for i, (term, count) in enumerate(sorted_terms, 1):
        # Which evaluated ontologies matched this term
        ontologies_for_term = [
            result['ontology']
            for result in results_summary
            if term in result['details']['ontology_coverage']['term_ontology_distribution']
        ]

        print(f"   {i}. {term} ({count} ontolog√≠as)")
        print(f"      Aparece en: {', '.join(ontologies_for_term[:3])}" +
              ("..." if len(ontologies_for_term) > 3 else ""))

    # 4. Distribution per ontology
    print(f"\nüìà DISTRIBUCI√ìN DE T√âRMINOS POR ONTOLOG√çA:")
    for ontology_name, stats in ontology_overlap_stats.items():
        print(f"   {ontology_name}: {stats['unique_terms_matched']} t√©rminos √∫nicos")

    # 5. Pairwise overlap, computed from results_summary itself so a name is never
    #    paired with another ontology's report
    if len(results_summary) > 1:
        print(f"\nüîÑ SOLAPAMIENTO ENTRE ONTOLOG√çAS:")
        for first, second in combinations(results_summary, 2):
            onto1 = first['ontology']
            onto2 = second['ontology']

            terms1 = set(first['details']['ontology_coverage']['term_ontology_distribution'].keys())
            terms2 = set(second['details']['ontology_coverage']['term_ontology_distribution'].keys())

            common_terms = terms1.intersection(terms2)
            if common_terms:
                print(f"   {onto1} ‚Üî {onto2}: {len(common_terms)} t√©rminos en com√∫n")

    if results_summary:
        best_result = max(results_summary, key=lambda x: (x['node_precision'] + x['edge_precision']) / 2)
        print(f"\nüîé Mejor ontolog√≠a: {best_result['ontology']}")
        print(f"   Precisi√≥n nodos: {best_result['node_precision']:.2%}")
        print(f"   Precisi√≥n aristas: {best_result['edge_precision']:.2%}")

        # Per-edge results are stored under edge_report -> details
        best_edge_details = best_result['details']['edge_report']['details']
        invalid_edges = [(k, v) for k, v in best_edge_details.items() if not v['fully_valid']]
        if invalid_edges:
            print(f"\n‚ö†Ô∏è Ejemplos de aristas inv√°lidas (primeras 5):")
            for (s, t, et), details in invalid_edges[:5]:
                print(f"   {s} --[{et}]--> {t}")
                if details['edge_type']['status'] == 'invalid':
                    print(f"      ‚îî‚îÄ Tipo de arista '{et}' no encontrado en ontolog√≠a")

# Aristas faltantes

In [None]:
import networkx as nx
from itertools import combinations
from mistralai import Mistral
import json
import time

class Graph:
    """Co-occurrence graph of entities with optional LLM-based edge labelling.

    Attributes:
        graph: undirected ``networkx.Graph``; edges created through
            ``add_edge`` carry ``weight`` (co-occurrence count) and ``relation``.
        cache_file: path of the JSON file where edge labels are persisted.
        cache: dict mapping ``"u-->v"`` keys to the label stored for that edge.
    """

    def __init__(self):
        self.graph = nx.Graph()
        self.cache_file = "cache_etiquetas.json"
        self.cache = self._cargar_cache()

    def add_edge(self, source, target, relation='cooccurs_with'):
        """Add an edge, or increment its ``weight`` when it already exists.

        Args:
            source: first endpoint.
            target: second endpoint.
            relation: value stored in the ``relation`` attribute of a new edge
                (an existing edge keeps its current relation).
        """
        if self.graph.has_edge(source, target):
            edge_data = self.graph[source][target]
            # An edge inserted elsewhere without a weight counts as one occurrence.
            edge_data["weight"] = edge_data.get("weight", 1) + 1
        else:
            self.graph.add_edge(source, target, weight=1, relation=relation)

    def build_relations(self, entity_names):
        """Connect every unordered pair of ``entity_names`` with a co-occurrence edge."""
        for source, target in combinations(entity_names, 2):
            self.add_edge(source, target, relation='cooccurs_with')

    def build_graph(self, parsed_entities):
        """Populate the graph from parsed items.

        Args:
            parsed_entities: iterable of dict-like items; those with an
                ``"entities"`` key are expected to hold a list of dicts with
                an ``"entity"`` string. Items without that key are ignored.
        """
        for item in parsed_entities:
            if "entities" not in item:
                continue

            normalized = (e["entity"].lower().strip() for e in item["entities"])
            # Order-preserving de-duplication: a plain set() would make the node
            # insertion order (and therefore the edge orientation used for the
            # cache keys) depend on string hash randomisation.
            entity_names = list(dict.fromkeys(name for name in normalized if name))

            # At least two distinct entities are needed to create an edge.
            if len(entity_names) < 2:
                continue

            self.build_relations(entity_names)

    def _cargar_cache(self):
        """Load the JSON cache file; return ``{}`` when it is missing or unusable."""
        try:
            with open(self.cache_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # The rest of the class indexes the cache by key, so only a JSON
            # object is acceptable.
            if not isinstance(data, dict):
                raise ValueError(f"se esperaba un objeto JSON, no {type(data).__name__}")
            return data
        except FileNotFoundError:
            # No cache yet: start empty without a warning.
            return {}
        except Exception as e:
            print(f"Aviso: No se pudo leer el cach√© ({e}). Se iniciar√° uno nuevo.")
            return {}

    def _guardar_cache(self):
        """Write the current cache to ``self.cache_file`` as JSON."""
        with open(self.cache_file, 'w', encoding='utf-8') as f:
            json.dump(self.cache, f, indent=4, ensure_ascii=False)

    def _obtener_key(self, u, v):
        """Return the cache key for the edge ``u -> v``."""
        return f"{u}-->{v}"

    def _buscar_en_cache(self, u, v):
        """Look up a cached label for the edge in either orientation.

        The graph is undirected, so the orientation reported by
        ``self.graph.edges()`` may differ from the one the label was cached under.

        Returns:
            ``(source, target, label)`` using the orientation found in the
            cache, or ``None`` when neither orientation is cached.
        """
        for source, target in ((u, v), (v, u)):
            key = self._obtener_key(source, target)
            if key in self.cache:
                return source, target, self.cache[key]
        return None

    def analyze_topology(self):
        """Print basic topology statistics of ``self.graph``.

        Returns:
            dict with ``"density"`` and ``"components"`` (number of connected
            components; weakly connected ones for directed graphs).
        """
        print("\nüï∏Ô∏è An√°lisis Topol√≥gico:")
        n = self.graph.number_of_nodes()

        # 1. Density
        density = nx.density(self.graph)
        print(f"   ‚Ä¢ Densidad: {density:.4f}")

        # 2. Connected components (reveals isolated islands)
        if nx.is_directed(self.graph):
            n_components = nx.number_weakly_connected_components(self.graph)
        else:
            n_components = nx.number_connected_components(self.graph)
        print(f"   ‚Ä¢ Componentes Conectados: {n_components}")

        # 3. Average degree (0 for an empty graph instead of dividing by zero)
        avg_degree = sum(dict(self.graph.degree()).values()) / n if n else 0.0
        print(f"   ‚Ä¢ Grado Promedio: {avg_degree:.2f}")

        # 4. Average clustering, computed on the undirected view of the graph.
        try:
            avg_clustering = nx.average_clustering(self.graph.to_undirected())
            print(f"   ‚Ä¢ Coeficiente de Clustering: {avg_clustering:.4f}")
        except Exception as exc:
            # Best effort: report why the metric is missing instead of hiding it.
            print(f"   - Coeficiente de Clustering: no disponible ({exc})")

        return {"density": density, "components": n_components}

    def label_edges(self, api_key, use_cache=True, batch_size=50):
        """Ask Mistral for a semantic label per edge and persist labels in the cache.

        ``self.graph`` is not modified: the labels are returned in a new graph.

        Args:
            api_key: Mistral API key.
            use_cache: when True, edges already present in the cache (in either
                orientation) are not sent to the API.
            batch_size: number of edges sent per request.

        Returns:
            ``networkx.DiGraph`` whose edges carry the label in both the
            ``relation`` and ``label`` attributes.

        Raises:
            ValueError: if edges must be queried and ``batch_size`` is below 1.
        """
        aristas_totales = list(self.graph.edges())
        aristas_a_consultar = []
        G_etiquetado = nx.DiGraph()

        # 1. Split edges into cached ones and ones that must be queried.
        for u, v in aristas_totales:
            cached = self._buscar_en_cache(u, v) if use_cache else None
            if cached is not None:
                origen, destino, etiqueta = cached
                # Same attributes as freshly labelled edges, so consumers that
                # read `relation` also see cached labels.
                G_etiquetado.add_edge(origen, destino, relation=etiqueta, label=etiqueta)
            else:
                aristas_a_consultar.append((u, v))

        if not aristas_a_consultar:
            print("Resultado obtenido √≠ntegramente desde el cach√©.")
            return G_etiquetado

        print(f"Total: {len(aristas_totales)} | Desde Cach√©: {len(aristas_totales) - len(aristas_a_consultar)} | Pendientes: {len(aristas_a_consultar)}")

        if batch_size < 1:
            raise ValueError("batch_size debe ser >= 1")

        # 2. Batched requests
        client = Mistral(api_key=api_key)
        total_lotes = ((len(aristas_a_consultar) - 1) // batch_size) + 1
        for i in range(0, len(aristas_a_consultar), batch_size):
            batch = aristas_a_consultar[i : i + batch_size]
            num_lote = i // batch_size + 1
            print(f"Procesando lote {num_lote} de {total_lotes}...")

            # String form of each requested pair -> real node objects, used to
            # reject pairs the model invents and to undo the JSON round trip.
            solicitadas = {(str(u), str(v)): (u, v) for u, v in batch}

            prompt_sistema = (
                "Eres un experto en ontolog√≠as. Devuelve un JSON con una etiqueta sem√°ntica "
                "corta para cada relaci√≥n [A, B]. Formato: {'relaciones': [[A, B, 'etiqueta'], ...]}"
            )
            prompt_usuario = f"Etiqueta estas aristas: {json.dumps(batch)}"

            try:
                chat_response = client.chat.complete(
                    model="mistral-small-latest",
                    messages=[
                        {"role": "system", "content": prompt_sistema},
                        {"role": "user", "content": prompt_usuario},
                    ],
                    response_format={"type": "json_object"}
                )

                datos_lote = json.loads(chat_response.choices[0].message.content)

                for item in datos_lote.get("relaciones", []):
                    if not isinstance(item, (list, tuple)) or len(item) != 3:
                        continue
                    u_res, v_res, etiqueta = item

                    par = solicitadas.get((str(u_res), str(v_res)))
                    if par is None:
                        # The model may answer with the pair reversed; keep its direction.
                        invertido = solicitadas.get((str(v_res), str(u_res)))
                        if invertido is None:
                            continue  # pair was never requested: ignore it
                        par = (invertido[1], invertido[0])
                    origen, destino = par

                    G_etiquetado.add_edge(origen, destino, relation=etiqueta, label=etiqueta)
                    self.cache[self._obtener_key(origen, destino)] = etiqueta

                # Persist right after every successful batch.
                self._guardar_cache()

                # Short pause between batches to respect API rate limits.
                if i + batch_size < len(aristas_a_consultar):
                    time.sleep(1)

            except Exception as e:
                # Best effort: a failed batch is reported and the rest continue.
                print(f"Error en el lote {num_lote}: {e}")
                continue

        print(f"Proceso finalizado. Grafo resultante con {G_etiquetado.number_of_edges()} aristas.")
        return G_etiquetado

    def run(self, parsed_entities):
        """Build the graph from ``parsed_entities`` (see ``build_graph``)."""
        self.build_graph(parsed_entities)

### Clase ExtendedGraph

In [None]:
import pandas as pd
import networkx as nx
import torch
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
from pykeen.predict import predict_target

class ExtendedGraph(Graph):
    """Graph enriched with edges predicted by a knowledge-graph-embedding model.

    Attributes:
        graph: copy of the wrapped graph; ``predict_edges`` tags its edges with
            ``origin`` (``'real'`` or ``'predicha'``).
        predicted_edges: list of dicts with ``source``, ``target``,
            ``relation`` and ``score`` for the edges added by the last
            ``predict_edges`` call.
    """

    def __init__(self, graph):
        """Wrap ``graph`` (an object exposing a networkx graph as ``.graph``)."""
        super().__init__()
        self.graph = graph.graph.copy()
        self.predicted_edges = []

    def predict_edges(self, relacion_busqueda="relacionado_con", n_predicciones=10, epochs=100):
        """Train a RotatE model on the graph and add the predicted edges to it.

        Predictions left over from a previous call are removed first, so each
        call trains only on real edges and ``predicted_edges`` always matches
        the ``'predicha'`` edges present in the graph.

        Args:
            relacion_busqueda (str): relation to predict; it must be one of the
                relations present in the graph (edges without a ``relation``
                attribute count as ``"relacionado_con"``).
            n_predicciones (int): top-scored candidates considered per node.
            epochs (int): training epochs.

        Returns:
            The PyKEEN pipeline result of the training run.

        Raises:
            ValueError: if the graph has no edges to train on.
        """
        # Drop stale predictions so they are neither trained on nor re-tagged as real.
        stale = [(u, v) for u, v, d in self.graph.edges(data=True) if d.get('origin') == 'predicha']
        self.graph.remove_edges_from(stale)
        self.predicted_edges = []

        triples_list = [
            [str(u), str(data.get('relation', 'relacionado_con')), str(v)]
            for u, v, data in self.graph.edges(data=True)
        ]
        if not triples_list:
            raise ValueError("El grafo no tiene aristas; no se puede entrenar el modelo.")

        df_triples = pd.DataFrame(triples_list, columns=['head', 'relation', 'tail'])

        tf = TriplesFactory.from_labeled_triples(triples=df_triples.values)
        training_factory, testing_factory = tf.split([0.8, 0.2], random_state=42)

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f"Usando dispositivo: {device}")
        print(f"Entrenando modelo para {len(self.graph.nodes)} nodos...")

        result = pipeline(
            training=training_factory,
            testing=testing_factory,
            model='RotatE',
            epochs=epochs,
            device=device,
            random_seed=42
        )

        # Everything currently in the graph is an observed edge.
        for _, _, data in self.graph.edges(data=True):
            data['origin'] = 'real'

        print("Generando predicciones de nuevas conexiones...")

        if relacion_busqueda not in tf.relation_to_id:
            # Without this check every node would fail silently in the loop below.
            disponibles = sorted(tf.relation_to_id)[:10]
            print(f"Aviso: la relacion '{relacion_busqueda}' no existe en el grafo "
                  f"(ejemplos disponibles: {disponibles}); no se generan predicciones.")
        else:
            # PyKEEN works with string labels; map them back to the node objects.
            label_to_node = {str(node): node for node in self.graph.nodes()}
            fallos = 0
            primer_error = None

            for nodo in list(self.graph.nodes()):
                try:
                    predicciones = predict_target(
                        model=result.model,
                        head=str(nodo),
                        relation=relacion_busqueda,
                        triples_factory=tf
                    ).df

                    top_preds = predicciones.sort_values(by='score', ascending=False).head(n_predicciones)

                    for _, row in top_preds.iterrows():
                        target = label_to_node.get(row['tail_label'], row['tail_label'])
                        score = float(row['score'])

                        if nodo == target or self.graph.has_edge(nodo, target):
                            continue

                        self.graph.add_edge(nodo, target,
                                            relation=relacion_busqueda,
                                            origin='predicha',
                                            weight=score)
                        self.predicted_edges.append({
                            'source': nodo,
                            'target': target,
                            'relation': relacion_busqueda,
                            'score': score
                        })
                except Exception as e:
                    # Best effort per node, but keep track so failures are visible.
                    fallos += 1
                    if primer_error is None:
                        primer_error = e
                    continue

            if fallos:
                print(f"Aviso: {fallos} nodos no pudieron evaluarse (primer error: {primer_error})")

        print(f"¬°Proceso completado! Aristas totales: {self.graph.number_of_edges()}")
        print(f"Aristas predichas a√±adidas: {len(self.predicted_edges)}")

        return result

    def print_img(self, remove_outliers=True, img_name="extended_graph.png"):
        """Draw the enriched graph: real edges in green, predicted ones dashed red.

        Args:
            remove_outliers (bool): when True, nodes with degree <= 1 are left
                out of the drawing.
            img_name (str): file the figure is saved to.
        """
        import matplotlib.pyplot as plt

        G_viz = self.graph.copy()

        if remove_outliers:
            low_degree_nodes = [node for node, degree in dict(G_viz.degree()).items() if degree <= 1]
            G_viz.remove_nodes_from(low_degree_nodes)

        pos = nx.spring_layout(G_viz, seed=42)

        plt.figure(figsize=(12, 12))

        nx.draw_networkx_nodes(G_viz, pos, node_size=300, node_color='lightblue')

        real_edges = [(u, v) for u, v, d in G_viz.edges(data=True) if d.get('origin') == 'real']
        nx.draw_networkx_edges(G_viz, pos, edgelist=real_edges, edge_color='green', label='Real', width=2)

        predicted_edges = [(u, v) for u, v, d in G_viz.edges(data=True) if d.get('origin') == 'predicha']
        nx.draw_networkx_edges(G_viz, pos, edgelist=predicted_edges, edge_color='red', style='dashed', label='Predicha', width=2)

        nx.draw_networkx_labels(G_viz, pos, font_size=10)

        plt.title("Grafo Enriquecido con Predicciones de Conexiones", fontsize=15)
        plt.legend(scatterpoints=1)
        plt.axis('off')
        plt.savefig(img_name)
        plt.show()

    def print_predictions(self):
        """Print the edges stored in ``self.predicted_edges``."""
        if not self.predicted_edges:
            print("No hay aristas predichas almacenadas.")
            return

        print(f"\nAristas Predichas ({len(self.predicted_edges)} total):")
        print("-" * 80)
        for edge in self.predicted_edges:
            print(f"{edge['source']} --({edge['relation']}, score: {edge['score']:.4f})--> {edge['target']}")
        print("-" * 80)

    def get_predicted_edges_dataframe(self):
        """Return the predicted edges as a DataFrame.

        Returns:
            pd.DataFrame: columns ``source``, ``target``, ``relation``,
            ``score`` (empty, with those columns, when nothing was predicted).
        """
        if not self.predicted_edges:
            return pd.DataFrame(columns=['source', 'target', 'relation', 'score'])
        return pd.DataFrame(self.predicted_edges)

    def get_extended_graph(self):
        """Return a new ``Graph`` holding a copy of the extended graph.

        Returns:
            Graph: object whose ``graph`` contains original and predicted edges.
        """
        extended_graph_obj = Graph()
        extended_graph_obj.graph = self.graph.copy()
        return extended_graph_obj

    def export_predicted_edges_img(self, remove_outliers=True, img_name="predicted_edges_only.png"):
        """Save an image showing only the predicted edges, coloured by score.

        Args:
            remove_outliers (bool): when True, nodes with degree <= 1 in the
                predicted-only graph are removed before drawing.
            img_name (str): file the figure is saved to.
        """
        import matplotlib.pyplot as plt

        G_predicted = nx.DiGraph()

        for u, v, data in self.graph.edges(data=True):
            if data.get('origin') == 'predicha':
                G_predicted.add_edge(u, v, **data)

        if G_predicted.number_of_edges() == 0:
            print("No hay aristas predichas para visualizar.")
            return

        if remove_outliers:
            low_degree_nodes = [node for node, degree in dict(G_predicted.degree()).items() if degree <= 1]
            G_predicted.remove_nodes_from(low_degree_nodes)

        # Removing outliers can leave nodes with no edges between them (e.g. the
        # middle node of a chain); min()/max() of the weights would then fail.
        if G_predicted.number_of_nodes() == 0 or G_predicted.number_of_edges() == 0:
            print("No hay nodos para visualizar despu√©s de remover outliers.")
            return

        pos = nx.spring_layout(G_predicted, seed=42, k=2, iterations=50)

        plt.figure(figsize=(14, 10))

        nx.draw_networkx_nodes(G_predicted, pos, node_size=500, node_color='lightcoral', alpha=0.9)

        edges = list(G_predicted.edges(data=True))
        weights = [d.get('weight', 0.5) for _, _, d in edges]

        nx.draw_networkx_edges(
            G_predicted,
            pos,
            edge_color=weights,
            edge_cmap=plt.cm.Reds,
            width=2,
            arrows=True,
            arrowsize=20,
            arrowstyle='->',
            connectionstyle='arc3,rad=0.1'
        )

        nx.draw_networkx_labels(G_predicted, pos, font_size=10, font_weight='bold')

        edge_labels = {(u, v): f"{d.get('weight', 0):.3f}" for u, v, d in G_predicted.edges(data=True)}
        nx.draw_networkx_edge_labels(G_predicted, pos, edge_labels, font_size=8)

        plt.title(f"Aristas Predichas por el Modelo\n({G_predicted.number_of_edges()} conexiones predichas)",
                  fontsize=16, fontweight='bold')

        sm = plt.cm.ScalarMappable(cmap=plt.cm.Reds, norm=plt.Normalize(vmin=min(weights), vmax=max(weights)))
        sm.set_array([])
        plt.colorbar(sm, label='Score de Confianza', ax=plt.gca(), shrink=0.8)

        plt.axis('off')
        plt.tight_layout()
        plt.savefig(img_name, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"Imagen exportada exitosamente: {img_name}")
        print(f"Nodos en el grafo: {G_predicted.number_of_nodes()}")
        print(f"Aristas predichas: {G_predicted.number_of_edges()}")

    def _escribir_formato_detallado(self, f):
        """Write the predicted edges to the open text file ``f`` in the detailed layout."""
        f.write("=" * 80 + "\n")
        f.write("ARISTAS PREDICHAS POR EL MODELO\n")
        f.write("=" * 80 + "\n")
        f.write(f"Total de aristas predichas: {len(self.predicted_edges)}\n")
        f.write("=" * 80 + "\n\n")

        for i, edge in enumerate(self.predicted_edges, 1):
            f.write(f"{i}. {edge['source']} --({edge['relation']}, score: {edge['score']:.4f})--> {edge['target']}\n")

    def export_predicted_edges_txt(self, txt_name="predicted_edges.txt", format="detailed"):
        """Export the predicted edges to a text file.

        Args:
            txt_name (str): output file name.
            format (str): one of
                - "detailed": numbered list with relation and score
                - "simple": ``source -> target``
                - "csv": comma-separated with header
                - "tsv": tab-separated with header
                Any other value falls back to "detailed" with a notice.
        """
        # Local import: the notebook cell defining this class does not import csv.
        import csv

        if not self.predicted_edges:
            print("No hay aristas predichas para exportar.")
            return

        try:
            with open(txt_name, 'w', encoding='utf-8') as f:
                if format == "simple":
                    for edge in self.predicted_edges:
                        f.write(f"{edge['source']} -> {edge['target']}\n")

                elif format in ("csv", "tsv"):
                    # csv.writer quotes fields containing the delimiter, quotes or
                    # newlines; "\n" keeps the line endings the file had before.
                    writer = csv.writer(
                        f,
                        delimiter="," if format == "csv" else "\t",
                        lineterminator="\n",
                    )
                    writer.writerow(["source", "target", "relation", "score"])
                    for edge in self.predicted_edges:
                        writer.writerow([
                            edge['source'],
                            edge['target'],
                            edge['relation'],
                            f"{edge['score']:.4f}",
                        ])

                else:
                    if format != "detailed":
                        print(f"Formato '{format}' no reconocido. Usando formato 'detailed'.")
                    self._escribir_formato_detallado(f)

            print(f"Aristas predichas exportadas exitosamente a: {txt_name}")
            print(f"Formato: {format}")
            print(f"Total de aristas: {len(self.predicted_edges)}")

        except Exception as e:
            # Best effort: export problems are reported, not raised.
            print(f"Error al exportar las aristas predichas: {e}")



In [None]:
# Wrap the existing graph G so the Graph helper methods can be used on it.
g = Graph()
g.graph = G
print("Esta es la evaluacion de la topologia del grafo")
topology_summary = g.analyze_topology()
print(topology_summary)

# The ten nodes with the highest degree, i.e. the hubs of the graph.
top_nodes = sorted(G.degree, key=lambda entry: entry[1], reverse=True)[:10]

print("\nüëë Top 10 Nodos (Hubs) del Grafo:")
for node, degree in top_nodes:
    print(f"   ‚Ä¢ {node}: {degree} conexiones")

### Predecir aristas faltantes de $G$

In [None]:
# Wrap G, copy it into an ExtendedGraph and train the link-prediction model.
# The arguments below are the defaults of predict_edges, spelled out so the
# tunable values are visible in the notebook.
graph = Graph()
graph.graph = G
extended_graph = ExtendedGraph(graph)
result = extended_graph.predict_edges(
    relacion_busqueda="relacionado_con",
    n_predicciones=10,
    epochs=100,
)



In [None]:
# Evaluation metrics of the trained model, converted to a DataFrame.
metrics = result.metric_results.to_df()
print("\nüìä M√©tricas de Calidad del Modelo KGE:", metrics, sep="\n")

In [None]:
# Save the predicted-edges-only figure (arguments are the method defaults).
extended_graph.export_predicted_edges_img(
    remove_outliers=True,
    img_name="predicted_edges_only.png",
)

## Comparar $G$ con el grafo extendido

In [None]:
import os
import requests
import networkx as nx
from difflib import get_close_matches
from owlready2 import get_ontology, World

# Input check: the comparison needs the original graph `G` and
# `extended_graph` (whose `.graph` attribute holds the enriched graph).
if 'G' not in locals() or 'extended_graph' not in locals():
    print("‚ùå Error: Debes tener definidos 'G' (original) y 'extended_graph' (con atributo .graph).")
else:
    G_original = G
    G_extended = extended_graph.graph

    print("\n" + "="*80)
    print("üìä COMPARACI√ìN: Grafo Original vs Grafo Extendido")
    print("="*80)

    comparison_results = []
    # Validators and extended-graph reports keyed by ontology name, kept so the
    # detailed analysis below reuses them instead of rebuilding each validator
    # and re-validating the whole graph.
    validators_by_ontology = {}
    extended_reports_by_ontology = {}

    print(f"{'Ontolog√≠a':<25} | {'Aristas Originales (V/I)':<25} | {'Aristas Extendidas (V/I)':<25} | {'Mejora'}")
    print("-" * 95)

    for name, url in ontologies_map.items():
        try:
            validator = GraphValidator(ontology_urls=[url])

            # Validate both graphs with the same validator.
            rep_orig = validator.validate_graph(G_original)
            rep_ext = validator.validate_graph(G_extended)

            edge_report_orig = rep_orig['edge_report']
            edge_report_ext = rep_ext['edge_report']

            # Gain in valid edges; the percentage is relative to the original
            # count, or a flat 100 / 0 when the original had no valid edges.
            improvement = edge_report_ext['valid_rels'] - edge_report_orig['valid_rels']
            if edge_report_orig['valid_rels'] > 0:
                improvement_pct = (improvement / edge_report_orig['valid_rels']) * 100
            else:
                improvement_pct = 100 if improvement > 0 else 0
            symbol = "‚úÖ" if improvement > 0 else ("‚ûñ" if improvement == 0 else "‚ùå")

            # Table row; the second number of each pair is the `weak_rels` count.
            orig_str = f"{edge_report_orig['valid_rels']}/{edge_report_orig['weak_rels']}"
            ext_str = f"{edge_report_ext['valid_rels']}/{edge_report_ext['weak_rels']}"
            imp_str = f"{symbol} {improvement:+d} ({improvement_pct:+.1f}%)"

            print(f"{name:<25} | {orig_str:<25} | {ext_str:<25} | {imp_str:<15}")

            comparison_results.append({
                "ontology": name,
                "original_valid_edges": edge_report_orig['valid_rels'],
                "original_invalid_edges": edge_report_orig['weak_rels'],
                "extended_valid_edges": edge_report_ext['valid_rels'],
                "extended_invalid_edges": edge_report_ext['weak_rels'],
                "improvement_absolute": improvement,
                "improvement_percentage": improvement_pct
            })
            validators_by_ontology[name] = validator
            extended_reports_by_ontology[name] = rep_ext

        except Exception as e:
            print(f"{name:<25} | ‚ùå ERROR: {str(e)}")

    print("-" * 95)
    # Detailed analysis of the predicted edges
    if comparison_results:
        # Ontology with the largest gain in valid edges (first one wins ties).
        best_result = max(comparison_results, key=lambda res: res['improvement_absolute'])
        best_improvement = best_result['improvement_absolute']
        best_ontology_name = best_result['ontology']
        best_ontology_report = extended_reports_by_ontology[best_ontology_name]

        if best_ontology_name:
            print(f"\nüèÜ Mayor mejora en aristas v√°lidas: {best_ontology_name}")

        # Edges that predict_edges added (tagged with origin == 'predicha').
        new_edges_in_extended = [
            (u, v, data.get('relation', 'unknown'))
            for u, v, data in G_extended.edges(data=True)
            if data.get('origin') == 'predicha'
        ]

        if new_edges_in_extended:
            strong_new_edges_count = 0
            if best_ontology_report:
                # Reuse the validator that produced the best report.
                detailed_validator = validators_by_ontology[best_ontology_name]

                for u, v, rel in new_edges_in_extended:
                    edge_validation_result = detailed_validator.validate_edge(u, v, rel)
                    if edge_validation_result['fully_valid']:
                        strong_new_edges_count += 1

            print(f"\nüîç An√°lisis de {len(new_edges_in_extended)} aristas predichas a√±adidas:")
            print(f"   ‚Ä¢ Validadas ontol√≥gicamente (por {best_ontology_name}): {strong_new_edges_count} ({(strong_new_edges_count/len(new_edges_in_extended))*100:.1f}%)")
        else:
            print("\nüîç No se encontraron aristas predichas a√±adidas al grafo extendido para analizar.")

# Etiquetar aristas

In [None]:
# Create a Mistral API key at https://admin.mistral.ai/organization/api-keys to label the edges.
# The key is read from the MISTRAL_API_KEY environment variable so that it is
# never stored in the notebook. SECURITY: a literal key used to be hardcoded
# here; treat that key as leaked and revoke it.
import os

api_key = os.environ.get("MISTRAL_API_KEY", "").strip()

if not api_key or api_key == "api_key":
    raise Exception("Genera tu api_key de mistral y definela en la variable de entorno MISTRAL_API_KEY")

In [None]:
from collections import Counter

labeled_graph = Graph()
# Work on a copy so the labels never leak into the original graph G, which the
# evaluation cell still uses as the unlabelled baseline.
labeled_graph.graph = G.copy()

# label_edges returns a separate labelled graph and leaves labeled_graph.graph
# untouched, so the labels have to be copied back onto the edges explicitly.
labeled_edges = labeled_graph.label_edges(api_key, use_cache=False)
for u, v, data in labeled_edges.edges(data=True):
    etiqueta = data.get('relation', data.get('label'))
    if etiqueta is not None and labeled_graph.graph.has_edge(u, v):
        labeled_graph.graph[u][v]['relation'] = etiqueta

# predict_edges can only predict a relation that exists in the graph; after
# labelling, the default "relacionado_con" may no longer be one of them, so the
# most frequent relation is used instead.
relation_counts = Counter(
    str(data.get('relation', 'relacionado_con'))
    for _, _, data in labeled_graph.graph.edges(data=True)
)
most_common_relation = relation_counts.most_common(1)[0][0] if relation_counts else "relacionado_con"

labeled_graph = ExtendedGraph(labeled_graph)
labeled_graph.predict_edges(relacion_busqueda=most_common_relation)

### Evaluar grafo etiquetado

In [None]:
import os
import requests
import networkx as nx
from difflib import get_close_matches
from owlready2 import get_ontology, World

# Input check: this cell compares `G` against `labeled_graph`, so those are the
# two names that must exist (the guard used to test `extended_graph`).
if 'G' not in locals() or 'labeled_graph' not in locals():
    print("‚ùå Error: Debes tener definidos 'G' (original) y 'labeled_graph' (con atributo .graph).")
else:
    G_original = G
    G_extended = labeled_graph.graph

    print("\n" + "="*80)
    print("üìä COMPARACI√ìN: Grafo Original vs Grafo Extendido")
    print("="*80)

    comparison_results = []
    # Validators and extended-graph reports keyed by ontology name, kept so the
    # detailed analysis below reuses them instead of rebuilding each validator
    # and re-validating the whole graph.
    validators_by_ontology = {}
    extended_reports_by_ontology = {}

    print(f"{'Ontolog√≠a':<25} | {'Aristas Originales (V/I)':<25} | {'Aristas Extendidas (V/I)':<25} | {'Mejora'}")
    print("-" * 95)

    for name, url in ontologies_map.items():
        try:
            validator = GraphValidator(ontology_urls=[url])

            # Validate both graphs with the same validator.
            rep_orig = validator.validate_graph(G_original)
            rep_ext = validator.validate_graph(G_extended)

            edge_report_orig = rep_orig['edge_report']
            edge_report_ext = rep_ext['edge_report']

            # Gain in valid edges; the percentage is relative to the original
            # count, or a flat 100 / 0 when the original had no valid edges.
            improvement = edge_report_ext['valid_rels'] - edge_report_orig['valid_rels']
            if edge_report_orig['valid_rels'] > 0:
                improvement_pct = (improvement / edge_report_orig['valid_rels']) * 100
            else:
                improvement_pct = 100 if improvement > 0 else 0
            symbol = "‚úÖ" if improvement > 0 else ("‚ûñ" if improvement == 0 else "‚ùå")

            # Table row; the second number of each pair is the `weak_rels` count.
            orig_str = f"{edge_report_orig['valid_rels']}/{edge_report_orig['weak_rels']}"
            ext_str = f"{edge_report_ext['valid_rels']}/{edge_report_ext['weak_rels']}"
            imp_str = f"{symbol} {improvement:+d} ({improvement_pct:+.1f}%)"

            print(f"{name:<25} | {orig_str:<25} | {ext_str:<25} | {imp_str:<15}")

            comparison_results.append({
                "ontology": name,
                "original_valid_edges": edge_report_orig['valid_rels'],
                "original_invalid_edges": edge_report_orig['weak_rels'],
                "extended_valid_edges": edge_report_ext['valid_rels'],
                "extended_invalid_edges": edge_report_ext['weak_rels'],
                "improvement_absolute": improvement,
                "improvement_percentage": improvement_pct
            })
            validators_by_ontology[name] = validator
            extended_reports_by_ontology[name] = rep_ext

        except Exception as e:
            print(f"{name:<25} | ‚ùå ERROR: {str(e)}")

    print("-" * 95)
    # Detailed analysis of the predicted edges
    if comparison_results:
        # Ontology with the largest gain in valid edges (first one wins ties).
        best_result = max(comparison_results, key=lambda res: res['improvement_absolute'])
        best_improvement = best_result['improvement_absolute']
        best_ontology_name = best_result['ontology']
        best_ontology_report = extended_reports_by_ontology[best_ontology_name]

        if best_ontology_name:
            print(f"\nüèÜ Mayor mejora en aristas v√°lidas: {best_ontology_name}")

        # Edges that predict_edges added (tagged with origin == 'predicha').
        new_edges_in_extended = [
            (u, v, data.get('relation', 'unknown'))
            for u, v, data in G_extended.edges(data=True)
            if data.get('origin') == 'predicha'
        ]

        if new_edges_in_extended:
            strong_new_edges_count = 0
            if best_ontology_report:
                # Reuse the validator that produced the best report.
                detailed_validator = validators_by_ontology[best_ontology_name]
                for u, v, rel in new_edges_in_extended:
                    edge_validation_result = detailed_validator.validate_edge(u, v, rel)
                    if edge_validation_result['fully_valid']:
                        strong_new_edges_count += 1

            print(f"\nüîç An√°lisis de {len(new_edges_in_extended)} aristas predichas a√±adidas:")
            print(f"   ‚Ä¢ Validadas ontol√≥gicamente (por {best_ontology_name}): {strong_new_edges_count} ({(strong_new_edges_count/len(new_edges_in_extended))*100:.1f}%)")
        else:
            print("\nüîç No se encontraron aristas predichas a√±adidas al grafo extendido para analizar.")