In [1]:
# ===========================
# SECTION 1 — KG Initialization
# ===========================

from rdflib import Graph, Namespace, URIRef, RDF, RDFS, OWL, Literal
from rapidfuzz import fuzz
import re

# -----------------------------
# NAMESPACE
# -----------------------------
# Base namespace that text_to_uri() prepends to cleaned text.
NELL = Namespace("http://nell-995.org/")

# -----------------------------
# EMPTY GRAPH + PREFIX BINDINGS
# -----------------------------
g = Graph()
for _prefix, _namespace in (("nell", NELL), ("owl", OWL), ("rdfs", RDFS)):
    g.bind(_prefix, _namespace)
# Relation names accepted by is_sport_relation(): every base relation below
# together with its "_inv" counterpart.
SPORT_RELATIONS = {
    base + suffix
    for base in (
        "athleteplayssport",
        "athleteplaysforteam",
        "athleteplayssportsteamposition",
        "athleteflyouttosportsteamposition",
    )
    for suffix in ("", "_inv")
}
# Generic entity class: declare nell:Entity as an owl:Class. The loader below
# types every kept subject and object as an instance of it.
g.add((NELL.Entity, RDF.type, OWL.Class))

def clean_uri(text):
    """Flatten an identifier into a single underscore-separated token.

    Every ``concept:`` marker is dropped, then each remaining ``:`` and ``/``
    becomes ``_`` (``concept:athlete:kevin_kolb`` -> ``athlete_kevin_kolb``).
    """
    without_marker = text.replace("concept:", "")
    return without_marker.translate(str.maketrans({":": "_", "/": "_"}))
def is_sport_relation(rel):
    """Return True when ``rel``, compared in lowercase, is in ``SPORT_RELATIONS``."""
    return rel.lower() in SPORT_RELATIONS
# Load the KB file, keeping only sport-related facts.
# A usable line has exactly three whitespace-separated fields, read in the
# order subject, object, relation; every other line is ignored.
path = "kb_env_rl.txt"
i = 0  # number of facts added to the graph
with open(path, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.split()
        if len(parts) != 3:
            continue
        subject, obj, relation = parts

        # Filter on the cleaned relation name before building any URIRef, so
        # rejected lines never allocate graph terms.
        relation_name = clean_uri(relation)
        if not is_sport_relation(relation_name):
            continue

        # NOTE(review): these URIRefs carry no namespace prefix, whereas
        # text_to_uri() prepends NELL. Confirm which form is intended;
        # exact lookups built with text_to_uri() cannot equal these nodes.
        s = URIRef(clean_uri(subject))
        r = URIRef(relation_name)
        o = URIRef(clean_uri(obj))

        g.add((s, r, o))
        g.add((r, RDF.type, OWL.ObjectProperty))
        g.add((s, RDF.type, NELL.Entity))
        g.add((o, RDF.type, NELL.Entity))
        i += 1

# Persist the filtered graph as N-Triples.
g.serialize(destination="knowledge_graph.nt", format="nt")


# -----------------------------
# HELPER FUNCTIONS
# -----------------------------
def clean_uri_fragment(text):
    """Normalise free text into a fragment usable as the tail of a URI.

    Steps, in order: drop every ``concept:`` marker, turn ``:`` and ``/`` into
    ``_``, trim surrounding whitespace, replace each whitespace run with one
    ``_``, then delete every character that is not a word character, ``-``
    or ``.``.
    """
    fragment = text.replace("concept:", "")
    for separator in (":", "/"):
        fragment = fragment.replace(separator, "_")
    underscored = re.sub(r"\s+", "_", fragment.strip())
    return re.sub(r"[^\w\-_.]", "", underscored)

def text_to_uri(text, ns=NELL):
    """Build a URIRef for ``text`` inside namespace ``ns`` (default: ``NELL``).

    The text is reduced with ``clean_uri_fragment`` and the result is appended
    directly to the namespace string.
    """
    fragment = clean_uri_fragment(text)
    return URIRef(f"{ns}{fragment}")

def get_labels(node, graph):
    """Collect every text label under which ``node`` may be matched.

    Sources:
      * each ``rdfs:label`` literal attached to the node in ``graph``;
      * for URIRef nodes, variants derived from the last ``/``-separated
        segment of the URI (e.g. ``concept_city_vegas``): the whole segment
        with underscores turned into spaces, the segment without a leading
        ``concept_``, and the pieces obtained by splitting on the first two
        underscores.

    Returns a set of strings; it can be empty for a non-URIRef node that has
    no ``rdfs:label``.
    """
    found = {
        str(value)
        for value in graph.objects(node, RDFS.label)
        if isinstance(value, Literal)
    }

    if not isinstance(node, URIRef):
        return found

    local_name = str(node).rsplit("/", 1)[-1]  # e.g., concept_city_vegas

    # Whole local name, underscores read as spaces
    found.add(local_name.replace("_", " "))

    # Same, minus the leading "concept_" marker
    if local_name.startswith("concept_"):
        found.add(local_name[len("concept_"):].replace("_", " "))

    # At most three pieces: first token, second token, remainder
    pieces = local_name.split("_", 2)

    # Final piece exactly as written (underscores are kept here)
    found.add(pieces[-1])

    if len(pieces) > 2:
        remainder = pieces[2]
        # Remainder after the category prefix (city_, visualizablething_, ...)
        found.add(remainder.replace("_", " "))
        # Second token + remainder, and the remainder on its own
        found.add(" ".join(pieces[1:]))
        found.add(" ".join(pieces[2:]))

    return found

# -----------------------------
# EXACT TRIPLE CHECK
# -----------------------------
def exists_exact_triple(graph, subj_text, rel_text, obj_text):
    """Report whether the triple built from three text values is in ``graph``.

    Each text is converted with ``text_to_uri`` (default namespace) before the
    membership test.
    """
    wanted = tuple(
        text_to_uri(part) for part in (subj_text, rel_text, obj_text)
    )
    return wanted in graph

# -----------------------------
# FUZZY LABEL MATCH
# -----------------------------
def find_by_label(graph, text, threshold=60):
    """Rank graph nodes by how closely any of their labels matches ``text``.

    Every node that appears as a subject or an object is considered. A node's
    score is the highest case-insensitive ``fuzz.ratio`` between ``text`` and
    one of the labels returned by ``get_labels``; nodes scoring below
    ``threshold`` are dropped.

    Returns a list of ``(node, score)`` tuples, best score first. Equal scores
    are ordered by the node's string form so that the ranking, and therefore
    the node a caller picks at index 0, is the same on every run.
    """
    query = text.lower()  # loop-invariant: lowercase once, not once per label
    candidates = set(graph.subjects()) | set(graph.objects())

    results = []
    for node in candidates:
        labels = get_labels(node, graph)
        if not labels:
            continue
        best = max(fuzz.ratio(query, label.lower()) for label in labels)
        if best >= threshold:
            results.append((node, best))

    results.sort(key=lambda pair: (-pair[1], str(pair[0])))
    return results


# -----------------------------
# SIMILAR TRIPLE SEARCH
# -----------------------------
def find_similar_triples(graph, subj, rel, obj, label_threshold=85, rel_threshold=20):
    """Look for the triple ``(subj, rel, obj)`` in ``graph``, exactly or approximately.

    Returns a dict with these keys:
      * ``exact`` - result of ``exists_exact_triple``; when True the remaining
        lists are left empty and no fuzzy search is run;
      * ``matched_subjects`` / ``matched_objects`` - ``find_by_label`` results
        for ``subj`` / ``obj`` at ``label_threshold``;
      * ``candidate_predicates`` - ``(predicate, score)`` pairs whose fuzzy
        score against the relation is at least ``rel_threshold``, best first;
      * ``similar_triples`` - one entry per predicate that links a matched
        subject to a matched object, with the three individual scores, sorted
        by the sum of those scores (highest first).
    """
    matches = {
        "exact": exists_exact_triple(graph, subj, rel, obj),
        "matched_subjects": [],
        "matched_objects": [],
        "candidate_predicates": [],
        "similar_triples": [],
    }

    if matches["exact"]:
        return matches

    # subject & object fuzzy matches
    matches["matched_subjects"] = find_by_label(graph, subj, threshold=label_threshold)
    matches["matched_objects"] = find_by_label(graph, obj, threshold=label_threshold)

    # Underscores in the incoming relation are read as spaces, mirroring what
    # is done to predicate fragments below.
    rel_norm = rel.replace("_", " ").strip()

    def predicate_score(pred):
        # One scoring rule for both lists, so a predicate gets the same score
        # in "candidate_predicates" and in "similar_triples".
        fragment = str(pred).split("/")[-1].replace("_", " ")
        return fuzz.ratio(rel_norm, fragment)

    # predicate fuzzy match
    scored = ((pred, predicate_score(pred)) for pred in set(graph.predicates()))
    matches["candidate_predicates"] = sorted(
        (pair for pair in scored if pair[1] >= rel_threshold),
        key=lambda pair: (-pair[1], str(pair[0])),
    )

    # possible similar triples
    for s_node, s_score in matches["matched_subjects"]:
        for o_node, o_score in matches["matched_objects"]:
            for pred in graph.predicates(subject=s_node, object=o_node):
                matches["similar_triples"].append({
                    "s": s_node,
                    "r": pred,
                    "o": o_node,
                    "scores": {
                        "subject": s_score,
                        "predicate": predicate_score(pred),
                        "object": o_score,
                    },
                })

    # sort by relevance
    matches["similar_triples"].sort(
        key=lambda entry: sum(entry["scores"].values()),
        reverse=True,
    )

    return matches

# -----------------------------
# FINAL FUNCTION TO CALL FROM SECTION 2
# -----------------------------
def check_rebel_triple_against_nell(graph, triple):
    """
    Compare one extracted triple with the graph.

    triple = { "subject": "...", "relation": "...", "object": "..." }

    Returns a dict holding the input triple, the exact-match flag and the
    fuzzy-match lists produced by ``find_similar_triples``.
    """
    found = find_similar_triples(
        graph, triple["subject"], triple["relation"], triple["object"]
    )

    report = {"input_triple": triple, "exists_exact": found["exact"]}
    for key in (
        "matched_subjects",
        "matched_objects",
        "candidate_predicates",
        "similar_triples",
    ):
        report[key] = found[key]
    return report

# -----------------------------
# CHECK IF NAME / ENTITY EXISTS IN NELL
# -----------------------------
def entity_exists_exact(graph, name):
    """
    Tell whether the URI built from ``name`` occurs in the graph, either as
    the subject or as the object of at least one triple.
    """
    uri = text_to_uri(name)
    if (uri, None, None) in graph:
        return True
    return (None, None, uri) in graph


def entity_exists_fuzzy(graph, name, threshold=70):
    """
    Return the ``find_by_label`` matches for ``name``: a list of
    ``(node, score)`` pairs scoring at least ``threshold``.
    """
    similar_nodes = find_by_label(graph, name, threshold=threshold)
    return similar_nodes




In [None]:
from groq import Groq
import re
from flair.data import Sentence
from flair.models import SequenceTagger
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
import os

# Prefer the GROQ_API_KEY environment variable so a real credential never has
# to be written into the notebook; the original placeholder stays as fallback.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY", "key"),
)

# Sentence-embedding model used by embedding_relation_similarity().
embedder = SentenceTransformer("all-MiniLM-L6-v2")
def embedding_relation_similarity(rebel_rel, graph):
    """
    Compare a REBEL relation to all KG relations using embedding cosine similarity.

    ``rebel_rel`` is embedded exactly as given. Each KG predicate is reduced to
    the last ``/`` segment of its URI, with underscores turned into spaces and
    lowercased, before being embedded.

    Returns a list of ``{"kg_relation": label, "embedding_score": cosine}``
    dicts, one per distinct predicate, most similar first (equal scores are
    ordered by label). An empty graph yields an empty list.
    """
    kg_labels = [
        str(pred).split("/")[-1].replace("_", " ").lower()
        for pred in set(graph.predicates())
    ]
    if not kg_labels:
        return []

    rebel_emb = embedder.encode(rebel_rel, convert_to_tensor=True)
    # One batched call instead of one model call per predicate.
    kg_embs = embedder.encode(kg_labels, convert_to_tensor=True)

    # cos_sim returns a 1 x N matrix here; row 0 holds one score per KG label.
    scores = util.cos_sim(rebel_emb, kg_embs)[0].tolist()

    results = [
        {"kg_relation": label, "embedding_score": score}
        for label, score in zip(kg_labels, scores)
    ]

    # order from most similar to least
    results.sort(key=lambda item: (-item["embedding_score"], item["kg_relation"]))
    return results
def relation(graph, subj_uri, obj_uri, candidate_relations):
    """
    Check which of the candidate relations actually exist between two nodes
    in the KG.

    ``subj_uri`` and ``obj_uri`` are expected to already be nodes of the graph.
    A candidate label (lowercased, spaces turned into underscores) matches a
    predicate when either string contains the other, the predicate being
    represented by the lowercased last ``/`` segment of its URI.

    Returns a list of ``(predicate_fragment, direction)`` tuples without
    duplicates, where ``direction`` is ``"forward"`` for subj -> obj and
    ``"inverse"`` for obj -> subj. Blank candidates are ignored.
    """
    from itertools import islice

    print(subj_uri, obj_uri)
    # Debug aid: peek at a few triples without copying the whole graph.
    for triple in islice(graph, 5):
        print("Sample triple in graph:", triple)

    # The predicate set does not depend on the candidate: build it once.
    predicates = [
        (pred_uri, str(pred_uri).split("/")[-1].lower())
        for pred_uri in set(graph.predicates())
    ]

    found = []
    for rel_label in candidate_relations:
        # Normalize the candidate relation label
        rel_norm = rel_label.strip().lower().replace(" ", "_")
        if not rel_norm:
            # "" is a substring of everything and would match every predicate.
            continue

        for pred_uri, pred_frag in predicates:
            # Substring test in both directions (equality is a special case)
            if not (rel_norm in pred_frag or pred_frag in rel_norm):
                continue

            # Check whether the triple is in the graph, in either direction
            for start, end, direction in (
                (subj_uri, obj_uri, "forward"),
                (obj_uri, subj_uri, "inverse"),
            ):
                hit = (pred_frag, direction)
                if (start, pred_uri, end) in graph and hit not in found:
                    found.append(hit)

    return found
# -----------------------------------------------------------------------------
# Load Models
# -----------------------------------------------------------------------------
# Flair sequence tagger used by extract_entities().
ner_tagger = SequenceTagger.load("ner-large")
# REBEL tokenizer and seq2seq model used by extract_rebel_relations().
tokenizer_rebel = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model_rebel = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

# -----------------------------------------------------------------------------
# Cleaning
# -----------------------------------------------------------------------------
def clean_text(text):
    """Strip social-media noise from ``text`` before NER / relation extraction.

    Removes, in order: URLs starting with ``http``, ``@mentions``,
    ``#hashtags``, and every character that is not a letter, a digit,
    whitespace or one of ``, . ! ? -``. Whitespace runs are then collapsed to
    a single space and the result is trimmed.

    Letters and digits are matched Unicode-wide, so accented names such as
    "José" keep all their characters instead of being truncated. Underscores
    are still removed.
    """
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    # \w covers letters/digits in any script; "_" is part of \w, so it is
    # removed explicitly.
    text = re.sub(r"[^\w\s,.!?-]|_", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


# -----------------------------------------------------------------------------
# Flair NER
# -----------------------------------------------------------------------------
def extract_entities(text):
    """Run the Flair NER tagger on ``text`` and list the spans it labels.

    Returns a list of ``{"text": ..., "type": ...}`` dicts, one per span
    returned by ``sentence.get_spans("ner")``, in that order.
    """
    sentence = Sentence(text)
    ner_tagger.predict(sentence)

    entities = []
    for span in sentence.get_spans("ner"):
        entities.append({"text": span.text, "type": span.get_label("ner").value})
    return entities


# -----------------------------------------------------------------------------
# RELATION NORMALIZATION & ADVANCED SIMILARITY
# -----------------------------------------------------------------------------

def normalize_nell_relation(rel_uri):
    """Convert NELL predicate URI into a lowercase, space-separated label.

    Works on the last ``/`` segment of the URI. Only camelCase boundaries
    (lowercase letter followed by uppercase) and underscores become spaces, so
    ``athletePlaysSport`` gives ``athlete plays sport`` while an all-lowercase
    name such as ``athleteplayssport`` comes back unchanged.
    """
    local_name = str(rel_uri).rsplit("/", 1)[-1]
    spaced = re.sub(r"([a-z])([A-Z])", r"\1 \2", local_name).replace("_", " ")
    return spaced.lower().strip()


def relation_token_similarity(rel1, rel2):
    """Score two relation strings by the share of whitespace-separated tokens they have in common.

    Tokens are compared case-insensitively. The score is the number of shared
    distinct tokens divided by the larger of the two distinct-token counts;
    it is 0 when either string has no tokens.
    """
    tokens_a = set(rel1.lower().split())
    tokens_b = set(rel2.lower().split())
    if tokens_a and tokens_b:
        shared = tokens_a & tokens_b
        return len(shared) / max(len(tokens_a), len(tokens_b))
    return 0


def relation_similarity(rebel_rel, nell_rel_uri):
    """Blend token overlap and fuzzy string ratio into one similarity score.

    The REBEL relation is lowercased and trimmed; the NELL predicate goes
    through ``normalize_nell_relation``. The result is
    ``0.6 * token_overlap + 0.4 * (fuzz.ratio / 100)``.
    """
    left = rebel_rel.lower().strip()
    right = normalize_nell_relation(nell_rel_uri)

    token_part = relation_token_similarity(left, right)
    fuzzy_part = fuzz.ratio(left, right) / 100.0

    # Token overlap carries the larger weight
    return 0.6 * token_part + 0.4 * fuzzy_part



# -----------------------------------------------------------------------------
# Parser for REBEL output (triples in one line)
# -----------------------------------------------------------------------------
def parse_rebel_output(text):
    """Turn one decoded REBEL line into a list of triple dicts.

    The text is split on runs of two or more whitespace characters and the
    chunks are consumed three at a time, read as subject, object, relation.
    Leftover chunks that do not fill a complete group are ignored.

    NOTE(review): this assumes every triple contributes exactly three chunks;
    confirm it holds for outputs that contain several triples.

    Each result has the keys ``subject``, ``relation``, ``object``,
    ``subject_id`` and ``object_id`` (both ids are always ``None``).
    """
    chunks = [chunk.strip() for chunk in re.split(r"\s{2,}", text.strip())]

    triples = []
    for start in range(0, len(chunks) - 2, 3):
        subj, obj, rel = chunks[start:start + 3]
        triples.append({
            "subject": subj,
            "relation": rel,
            "object": obj,
            "subject_id": None,
            "object_id": None,
        })
    return triples


# -----------------------------------------------------------------------------
# REBEL wrapper
# -----------------------------------------------------------------------------
def extract_rebel_relations(text):
    """Generate relations for ``text`` with REBEL and parse them into triples.

    The input is tokenized with truncation, decoded from the first generated
    sequence with special tokens skipped, echoed to stdout between banner
    lines, and handed to ``parse_rebel_output``.
    """
    encoded = tokenizer_rebel(text, return_tensors="pt", truncation=True)
    generation_args = {"max_length": 256, "num_beams": 3, "length_penalty": 1.0}
    generated = model_rebel.generate(**encoded, **generation_args)

    decoded = tokenizer_rebel.decode(generated[0], skip_special_tokens=True)

    for banner_line in (
        "\n===== RAW REBEL OUTPUT =====",
        decoded,
        "============================\n",
    ):
        print(banner_line)

    return parse_rebel_output(decoded)


# -----------------------------------------------------------------------------
# Combined pipeline
# -----------------------------------------------------------------------------
def getnlp(text):
    """Clean ``text`` and run both extractors on the cleaned version.

    Returns a dict with ``clean_text`` (the cleaned text, lowercased),
    ``entities`` (from ``extract_entities``) and ``relations`` (from
    ``extract_rebel_relations``). Both extractors receive the cleaned text in
    its original casing.
    """
    cleaned = clean_text(text)
    entities = extract_entities(cleaned)
    relations = extract_rebel_relations(cleaned)
    return {
        "clean_text": cleaned.lower(),
        "entities": entities,
        "relations": relations,
    }
    


# -----------------------------------------------------------------------------
# Example
# -----------------------------------------------------------------------------
article = """Cristiano Ronaldo is a basketball player
"""

result = getnlp(article)

print("\n==== ENTITIES ====")
print(result["entities"])

print("\n==== RELATIONS ====")
print(result["relations"])

print("\n==== CHECKING REBEL RELATIONS AGAINST NELL =====\n")
for tr in result["relations"]:
    print(f"\nREBEL TRIPLE: {tr}")

    subj = tr["subject"]
    rel  = tr["relation"]
    obj  = tr["object"]

    # ======================================
    # 1. CHECK SUBJECT EXISTENCE
    # ======================================
    print("\n-- SUBJECT CHECK --")
    if entity_exists_exact(g, subj):
        print(f"✓ EXACT subject match in NELL: {subj}")
        # relation() below compares graph nodes, so continue with the URI
        # rather than the raw text.
        subj = text_to_uri(subj)
    else:
        fuzzy_s = entity_exists_fuzzy(g, subj)
        if fuzzy_s:
            print("~ POSSIBLE subject matches:")
            for node, score in fuzzy_s[:5]:
                print(f"    {node} (score {score})")
            # Continue with the best-scoring node
            subj = fuzzy_s[0][0]
        else:
            print(f"✗ No subject match for: {subj}")

    # ======================================
    # 2. CHECK OBJECT EXISTENCE
    # ======================================
    print("\n-- OBJECT CHECK --")
    if entity_exists_exact(g, obj):
        print(f"✓ EXACT object match in NELL: {obj}")
        # Same as for the subject: hand relation() a graph node.
        obj = text_to_uri(obj)
    else:
        fuzzy_o = entity_exists_fuzzy(g, obj)
        if fuzzy_o:
            print("~ POSSIBLE object matches:")
            for node, score in fuzzy_o[:5]:
                print(f"    {node} (score {score})")
            obj = fuzzy_o[0][0]
        else:
            print(f"✗ No object match for: {obj}")

    # ======================================
    #  CHECK TRIPLE EXISTENCE
    # ======================================
    print("\n-- TRIPLE CHECK --")
    check = check_rebel_triple_against_nell(g, tr)

    if check["exists_exact"]:
        # An exact hit leaves "similar_triples" empty; without this branch it
        # would be reported as "NOT found" and sent to the LLM.
        print("✓ EXACT triple found in NELL")
    elif check["similar_triples"]:
        print("~ Similar triples found:")
        for st in check["similar_triples"][:5]:
            print(f"    {st['s']}  --{st['r']}-->  {st['o']} (scores={st['scores']})")
    else:
        print("✗ Triple NOT found in NELL")
        print("---------------------")
        embed_scores = embedding_relation_similarity(rel, g)

        for entry in embed_scores[:5]:
            print(f"  {entry['kg_relation']}   (score={entry['embedding_score']:.4f})")
        top_relations = [entry["kg_relation"] for entry in embed_scores[:5]]
        relat = relation(g, subj, obj, top_relations)

        if relat:
            # Separate name so the REBEL relation in `rel` is not overwritten
            for found_rel, direction in relat:
                print(f" {found_rel} exists between {subj} and {obj} ({direction})")
        else:
            print(" None of the top candidate relations exist between these entities.")
            # Last resort: ask the LLM about the original article text
            request = "Please check if this is true:" + article + "Keep it simple, less than 50 words"
            chat_completion = client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": request,
                    }
                ],
                model="llama-3.3-70b-versatile",
            )
            print(chat_completion.choices[0].message.content)
    print("\n-----------------------------------\n")


2025-12-04 15:10:54,982 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>

===== RAW REBEL OUTPUT =====
 Kevin Kolb  Eagles  member of sports team


==== ENTITIES ====
[{'text': 'Kevin Kolb', 'type': 'PER'}, {'text': 'Eagles', 'type': 'ORG'}, {'text': 'Eagles', 'type': 'ORG'}]

==== RELATIONS ====
[{'subject': 'Kevin Kolb', 'relation': 'member of sports team', 'object': 'Eagles', 'subject_id': None, 'object_id': None}]

==== CHECKING REBEL RELATIONS AGAINST NELL =====


REBEL TRIPLE: {'subject': 'Kevin Kolb', 'relation': 'member of sports team', 'object': 'Eagles', 'subject_id': None, 'object_id': None}

-- SUBJECT CHECK --
~ POSSIBLE subject matches:
    concept_athlete_kevin_kolb (score 100.0)
    concept_coach_kevin_love (score 70.0)

-- OBJECT CHECK --
~ POSSIBLE object matches:
    concept_sportsteam_eagles (score 100.0)

-- TRIPLE CHECK --
