In [2]:
# ===========================
# SECTION 1 — KG Initialization
# ===========================

from rdflib import Graph, Namespace, URIRef, RDF, RDFS, OWL, Literal
from rapidfuzz import fuzz
import re

# -----------------------------
# NAMESPACE
# -----------------------------
# Base IRI under which NELL terms are minted (see text_to_uri).
NELL = Namespace("http://nell-995.org/")

# -----------------------------
# GRAPH SETUP
# -----------------------------
# Create the empty graph and register the prefixes that should be used
# for the NELL, OWL and RDFS namespaces.
g = Graph()
for _prefix, _namespace in (("nell", NELL), ("owl", OWL), ("rdfs", RDFS)):
    g.bind(_prefix, _namespace)
# Relation names that are kept when loading the KB. Every base relation is
# listed together with its "_inv" counterpart.
SPORT_RELATIONS = {
    base_name + suffix
    for base_name in (
        "athleteplayssport",
        "athleteplaysforteam",
        "athleteplayssportsteamposition",
        "athleteflyouttosportsteamposition",
    )
    for suffix in ("", "_inv")
}
# Generic entity class: declare NELL.Entity as an owl:Class. The KB loader
# further down types every kept subject and object with this class.
g.add((NELL.Entity, RDF.type, OWL.Class))

def clean_uri(text):
    """Return *text* without any "concept:" marker and with ":" and "/" turned into "_"."""
    without_marker = text.replace("concept:", "")
    # One pass that maps both separator characters to an underscore.
    return without_marker.translate(str.maketrans({":": "_", "/": "_"}))
def is_sport_relation(rel):
    """Return True when the lower-cased relation name is listed in SPORT_RELATIONS."""
    return rel.lower() in SPORT_RELATIONS
# Load the KB file. Only lines made of exactly three whitespace-separated
# fields are used; the fields are read as "subject object relation".
# NOTE(review): the URIs built here carry no NELL namespace prefix, whereas
# text_to_uri() prepends it -- confirm whether exact lookups are expected to
# match nodes loaded by this loop.
path = "kb_env_rl.txt"
i = 0  # number of triples kept after the sport-relation filter
with open(path, encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split()
        if len(parts) != 3:
            continue  # not a three-field line: nothing to load
        subject, obj, relation = parts
        s, r, o = (URIRef(clean_uri(field)) for field in (subject, relation, obj))
        if is_sport_relation(r):
            # The fact itself, plus typing for the predicate and both endpoints.
            g.add((s, r, o))
            g.add((r, RDF.type, OWL.ObjectProperty))
            g.add((s, RDF.type, NELL.Entity))
            g.add((o, RDF.type, NELL.Entity))
            i += 1

# Persist the filtered graph as N-Triples.
g.serialize(destination="knowledge_graph.nt", format="nt")


# -----------------------------
# HELPER FUNCTIONS
# -----------------------------
def clean_uri_fragment(text):
    """Turn free text into a fragment usable at the end of a URI.

    The "concept:" marker is dropped, ":" and "/" become "_", surrounding
    whitespace is trimmed, inner whitespace runs collapse to a single "_",
    and every character that is not a word character, "-", "_" or "." is
    removed.
    """
    cleaned = text.replace("concept:", "")
    for separator in (":", "/"):
        cleaned = cleaned.replace(separator, "_")
    underscored = re.sub(r"\s+", "_", cleaned.strip())
    return re.sub(r"[^\w\-_.]", "", underscored)

def text_to_uri(text, ns=NELL):
    """Build a URIRef by appending the cleaned form of *text* to namespace *ns*."""
    fragment = clean_uri_fragment(text)
    return URIRef("{}{}".format(ns, fragment))

def get_labels(node, graph):
    """Collect the candidate text labels for *node*.

    The result holds every rdfs:label literal attached to the node and, for
    URI nodes, several variants derived from the last "/" segment of the URI
    (e.g. "concept_city_vegas").
    """
    labels = {
        str(value)
        for value in graph.objects(node, RDFS.label)
        if isinstance(value, Literal)
    }

    if not isinstance(node, URIRef):
        return labels

    local_name = str(node).rsplit("/", 1)[-1]

    # Whole local name with underscores shown as spaces.
    labels.add(local_name.replace("_", " "))

    # Same, without a leading "concept_".
    if local_name.startswith("concept_"):
        labels.add(local_name[len("concept_"):].replace("_", " "))

    # Split off at most two leading segments; the final piece is the remainder.
    pieces = local_name.split("_", 2)
    labels.add(pieces[-1])

    if len(pieces) == 3:
        remainder = pieces[2]
        labels.update(
            (
                remainder.replace("_", " "),  # remainder, underscores as spaces
                " ".join(pieces[1:]),         # second segment + raw remainder
                remainder,                    # raw remainder
            )
        )

    return labels

# -----------------------------
# EXACT TRIPLE CHECK
# -----------------------------
def exists_exact_triple(graph, subj_text, rel_text, obj_text):
    """Return True when the triple built with text_to_uri from the three texts is in *graph*."""
    triple = tuple(
        text_to_uri(part) for part in (subj_text, rel_text, obj_text)
    )
    return triple in graph

# -----------------------------
# FUZZY LABEL MATCH
# -----------------------------
def find_by_label(graph, text, threshold=60):
    """Return (node, score) pairs whose best label score reaches *threshold*.

    Every subject and object of *graph* is considered. A node's score is the
    highest case-insensitive fuzz.ratio between *text* and any of its labels.
    The pairs are returned with the highest score first.
    """
    scored = []
    for node in set(graph.subjects()) | set(graph.objects()):
        labels = get_labels(node, graph)
        if labels:
            top = max(fuzz.ratio(text.lower(), label.lower()) for label in labels)
            if top >= threshold:
                scored.append((node, top))

    return sorted(scored, key=lambda pair: pair[1], reverse=True)


# -----------------------------
# SIMILAR TRIPLE SEARCH
# -----------------------------
def find_similar_triples(graph, subj, rel, obj, label_threshold=85, rel_threshold=20):
    """Look for the triple, or triples resembling it, in *graph*.

    Args:
        graph: graph to search.
        subj, rel, obj: subject, relation and object as plain text.
        label_threshold: minimum label score for subject/object matches.
        rel_threshold: minimum score for a predicate to be listed as a
            candidate.

    Returns:
        A dict with the keys "exact" (bool), "matched_subjects",
        "matched_objects" and "candidate_predicates" (lists of
        (node, score) pairs, best first) and "similar_triples" (list of dicts
        with "s", "r", "o" and "scores", best total score first). When the
        exact triple exists the lists are left empty.
    """
    matches = {
        "exact": exists_exact_triple(graph, subj, rel, obj),
        "matched_subjects": [],
        "matched_objects": [],
        "candidate_predicates": [],
        "similar_triples": [],
    }

    if matches["exact"]:
        return matches

    # Subject and object fuzzy matches.
    matches["matched_subjects"] = find_by_label(graph, subj, threshold=label_threshold)
    matches["matched_objects"] = find_by_label(graph, obj, threshold=label_threshold)

    # The relation is normalised once and the same form is used for every
    # predicate comparison, so that all predicate scores are comparable.
    rel_norm = rel.replace("_", " ").strip()

    def score_predicate(pred):
        frag = str(pred).split("/")[-1].replace("_", " ")
        return fuzz.ratio(rel_norm, frag)

    # Predicate fuzzy match.
    for pred in set(graph.predicates()):
        score = score_predicate(pred)
        if score >= rel_threshold:
            matches["candidate_predicates"].append((pred, score))
    # Best candidates first, like the subject/object matches.
    matches["candidate_predicates"].sort(key=lambda pair: pair[1], reverse=True)

    # Triples that actually link a matched subject to a matched object.
    for s_node, s_score in matches["matched_subjects"]:
        for o_node, o_score in matches["matched_objects"]:
            for pred in graph.predicates(subject=s_node, object=o_node):
                matches["similar_triples"].append({
                    "s": s_node,
                    "r": pred,
                    "o": o_node,
                    "scores": {
                        "subject": s_score,
                        "predicate": score_predicate(pred),
                        "object": o_score,
                    },
                })

    # Sort by relevance (sum of the three scores).
    matches["similar_triples"].sort(
        key=lambda t: sum(t["scores"].values()),
        reverse=True,
    )

    return matches

# -----------------------------
# FINAL FUNCTION TO CALL FROM SECTION 2
# -----------------------------
def check_rebel_triple_against_nell(graph, triple):
    """Run find_similar_triples for a REBEL triple and repackage the result.

    *triple* is a dict of the form
    { "subject": "...", "relation": "...", "object": "..." }.
    The returned dict echoes it under "input_triple", exposes the exact-match
    flag as "exists_exact" and copies the remaining result lists unchanged.
    """
    result = find_similar_triples(
        graph, triple["subject"], triple["relation"], triple["object"]
    )

    report = {"input_triple": triple, "exists_exact": result["exact"]}
    for key in (
        "matched_subjects",
        "matched_objects",
        "candidate_predicates",
        "similar_triples",
    ):
        report[key] = result[key]
    return report

# -----------------------------
# CHECK IF NAME / ENTITY EXISTS IN NELL
# -----------------------------
def entity_exists_exact(graph, name):
    """Return True when the URI built from *name* occurs as a subject or an object in *graph*."""
    uri = text_to_uri(name)
    as_subject = (uri, None, None)
    as_object = (None, None, uri)
    return any(pattern in graph for pattern in (as_subject, as_object))


def entity_exists_fuzzy(graph, name, threshold=70):
    """Return the find_by_label matches for *name* at the given *threshold*."""
    matches = find_by_label(graph, name, threshold)
    return matches




In [None]:
from groq import Groq
import re
from flair.data import Sentence
from flair.models import SequenceTagger
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
import os

# Take the Groq API key from the GROQ_API_KEY environment variable so that no
# credential has to be stored in the notebook. The previous literal is kept
# as the fallback to stay backward-compatible.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY", "key"),
)

embedder = SentenceTransformer("all-MiniLM-L6-v2")
def embedding_relation_similarity(rebel_rel, graph):
    """Rank the predicates of *graph* by embedding similarity to *rebel_rel*.

    The REBEL relation is encoded exactly as given. Each KG predicate is
    reduced to the last "/" segment of its URI, with underscores replaced by
    spaces and lower-cased, before being encoded. Returns a list of dicts
    {"kg_relation": label, "embedding_score": score}, highest score first.
    """
    query_vec = embedder.encode(rebel_rel, convert_to_tensor=True)

    def similarity_to_query(label):
        label_vec = embedder.encode(label, convert_to_tensor=True)
        return util.cos_sim(query_vec, label_vec).item()

    kg_labels = (
        str(pred).split("/")[-1].replace("_", " ").lower()
        for pred in set(graph.predicates())
    )
    ranked = [
        {"kg_relation": label, "embedding_score": similarity_to_query(label)}
        for label in kg_labels
    ]

    # Most similar first.
    return sorted(ranked, key=lambda entry: entry["embedding_score"], reverse=True)
def relation(graph, subj_uri, obj_uri, candidate_relations):
    """Check which candidate relations really link the two nodes in the KG.

    Args:
        graph: graph to search; must provide ``predicates()`` and support
            ``triple in graph``.
        subj_uri, obj_uri: the two nodes, already given as graph terms.
        candidate_relations: iterable of relation labels; spaces are treated
            as underscores and the comparison is case-insensitive.

    Returns:
        A list of unique ``(predicate_fragment, direction)`` pairs, where
        direction is "forward" (subject -> object) or "inverse"
        (object -> subject). A predicate matches a candidate when its last
        "/" segment equals the candidate or one contains the other.
    """
    # Computed once: the predicate set does not depend on the candidate.
    pred_frags = {
        pred_uri: str(pred_uri).split("/")[-1].lower()
        for pred_uri in set(graph.predicates())
    }

    found = []
    seen = set()
    for rel_label in candidate_relations:
        # Normalize the candidate relation label.
        rel_norm = rel_label.strip().lower().replace(" ", "_")
        if not rel_norm:
            # An empty label is a substring of everything and would match
            # every predicate.
            continue

        for pred_uri, pred_frag in pred_frags.items():
            # pred_frag == rel_norm is covered by the containment test.
            if rel_norm not in pred_frag and pred_frag not in rel_norm:
                continue

            # Check whether the triple is in the graph, in either direction.
            for triple, direction in (
                ((subj_uri, pred_uri, obj_uri), "forward"),
                ((obj_uri, pred_uri, subj_uri), "inverse"),
            ):
                hit = (pred_frag, direction)
                # Overlapping candidates can reach the same predicate;
                # report each hit only once.
                if hit not in seen and triple in graph:
                    seen.add(hit)
                    found.append(hit)

    return found
# -----------------------------------------------------------------------------
# Load Models
# -----------------------------------------------------------------------------
# The REBEL tokenizer and model are loaded from the same checkpoint name.
_REBEL_CHECKPOINT = "Babelscape/rebel-large"

ner_tagger = SequenceTagger.load("ner-large")
tokenizer_rebel = AutoTokenizer.from_pretrained(_REBEL_CHECKPOINT)
model_rebel = AutoModelForSeq2SeqLM.from_pretrained(_REBEL_CHECKPOINT)



# -----------------------------------------------------------------------------
# Cleaning
# -----------------------------------------------------------------------------
def clean_text(text):
    """Strip URLs, @mentions, #hashtags and unsupported characters from *text*.

    Only ASCII letters, digits, whitespace and the characters , . ! ? - are
    kept. Whitespace runs are then collapsed to one space and the result is
    trimmed.
    """
    substitutions = (
        (r"http\S+", ""),                  # URLs
        (r"@\w+", ""),                     # @mentions
        (r"#\w+", ""),                     # #hashtags
        (r"[^A-Za-z0-9\s,.!?-]", ""),      # anything outside the allowed set
        (r"\s+", " "),                     # collapse whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()


# -----------------------------------------------------------------------------
# Flair NER
# -----------------------------------------------------------------------------
def extract_entities(text):
    """Tag *text* with the Flair NER model and return its spans as dicts.

    Each dict has the span text under "text" and the value of its "ner"
    label under "type".
    """
    sentence = Sentence(text)
    ner_tagger.predict(sentence)

    entities = []
    for span in sentence.get_spans("ner"):
        entities.append({"text": span.text, "type": span.get_label("ner").value})
    return entities


# -----------------------------------------------------------------------------
# RELATION NORMALIZATION & ADVANCED SIMILARITY
# -----------------------------------------------------------------------------

def normalize_nell_relation(rel_uri):
    """Convert a NELL predicate URI into a lower-case, space-separated label.

    Only the last "/" segment is used. A space is inserted at every
    lower-to-upper case boundary (camelCase) and underscores become spaces;
    names written entirely in lower case are therefore not split into words.
    """
    local_name = str(rel_uri).rsplit("/", 1)[-1]
    camel_split = re.sub(r"([a-z])([A-Z])", r"\1 \2", local_name)
    spaced = camel_split.replace("_", " ")
    return spaced.lower().strip()


def relation_token_similarity(rel1, rel2):
    """Return the share of whitespace-separated tokens common to both relations.

    Tokens are compared case-insensitively; the number of shared tokens is
    divided by the size of the larger token set. Returns 0 when either
    relation has no tokens.
    """
    tokens_a, tokens_b = (set(rel.lower().split()) for rel in (rel1, rel2))
    if tokens_a and tokens_b:
        return len(tokens_a & tokens_b) / max(len(tokens_a), len(tokens_b))
    return 0


def relation_similarity(rebel_rel, nell_rel_uri):
    """Hybrid similarity between a REBEL relation and a NELL predicate URI.

    Combines the fuzzy string ratio (scaled to 0-1) with the token-overlap
    score; token overlap carries the larger weight (0.6 versus 0.4).
    """
    left = rebel_rel.lower().strip()
    right = normalize_nell_relation(nell_rel_uri)

    fuzzy_part = fuzz.ratio(left, right) / 100.0
    token_part = relation_token_similarity(left, right)

    return 0.6 * token_part + 0.4 * fuzzy_part



# -----------------------------------------------------------------------------
# Parser for REBEL output (triples in one line)
# -----------------------------------------------------------------------------
def parse_rebel_output(text):
    """Parse decoded REBEL output into a list of triple dicts.

    The text is split on runs of two or more whitespace characters and the
    chunks are consumed three at a time as (subject, object, relation).
    Trailing chunks that do not form a complete group are ignored.
    """
    chunks = [piece.strip() for piece in re.split(r"\s{2,}", text.strip())]

    return [
        {
            "subject": chunks[start],
            "relation": chunks[start + 2],
            "object": chunks[start + 1],
            "subject_id": None,
            "object_id": None,
        }
        for start in range(0, len(chunks) - 2, 3)
    ]


# -----------------------------------------------------------------------------
# REBEL wrapper
# -----------------------------------------------------------------------------
def extract_rebel_relations(text):
    """Run REBEL on *text*, print the raw decoded output and return the parsed triples."""
    generation_options = {
        "max_length": 256,
        "num_beams": 3,
        "length_penalty": 1.0,
    }
    encoded = tokenizer_rebel(text, return_tensors="pt", truncation=True)
    generated = model_rebel.generate(**encoded, **generation_options)

    # Only the first generated sequence is decoded.
    decoded = tokenizer_rebel.decode(generated[0], skip_special_tokens=True)

    print(
        "\n===== RAW REBEL OUTPUT =====",
        decoded,
        "============================\n",
        sep="\n",
    )

    return parse_rebel_output(decoded)


# -----------------------------------------------------------------------------
# Combined pipeline
# -----------------------------------------------------------------------------
def getnlp(text):
    """Clean *text* and run entity and relation extraction on the cleaned form.

    Returns a dict with the lower-cased cleaned text ("clean_text"), the NER
    entities ("entities") and the REBEL triples ("relations").
    """
    cleaned = clean_text(text)
    lowered = cleaned.lower()
    entities = extract_entities(cleaned)
    rebel_triples = extract_rebel_relations(cleaned)

    return {
        "clean_text": lowered,
        "entities": entities,
        "relations": rebel_triples,
    }
    

# Example input:
#   "Elton Brand is a basketball player and Ronaldo is also a basketball player."


def _resolve_entity(graph, name, role, out):
    """Resolve *name* to a graph node and log the outcome to *out*.

    *role* is the label used in the message ("Sujeito" or "Objeto").
    Returns the exact URI when it exists, otherwise the best fuzzy match,
    otherwise None.
    """
    if entity_exists_exact(graph, name):
        out.append(f"✓ {role} existe (exato) em NELL: {name}")
        return text_to_uri(name)

    fuzzy = entity_exists_fuzzy(graph, name)
    if fuzzy:
        best_uri, score = fuzzy[0]
        out.append(f"~ {role} aproximado: {best_uri} (score={score})")
        return best_uri

    out.append(f"✗ {role} não encontrado: {name}")
    return None


def relations(relations):
    """Validate every REBEL triple against the NELL graph and report the result.

    For each triple the subject and object are resolved to graph nodes
    (exact match first, then fuzzy), the triple is looked up (exact, then
    similar), and, if nothing is found, the KG is searched for one of the
    five relations closest to the REBEL relation by embedding similarity.
    As a last resort the Groq LLM is asked about the original statement.

    Args:
        relations: list of dicts with the keys "subject", "relation" and
            "object", as produced by parse_rebel_output.

    Returns:
        The report for all triples as one newline-joined string.
    """
    output_lines = []

    for tr in relations:
        subj = tr["subject"]
        rel = tr["relation"]
        obj = tr["object"]

        out = [f"\nREBEL: [{subj}] --({rel})--> [{obj}]"]

        # -----------------------
        # SUBJECT / OBJECT CHECK
        # -----------------------
        # Graph lookups below use the resolved nodes; the original text is
        # kept for the report and for the LLM question.
        subj_node = _resolve_entity(g, subj, "Sujeito", out)
        obj_node = _resolve_entity(g, obj, "Objeto", out)

        # -----------------------
        # TRIPLE CHECK
        # -----------------------
        check = check_rebel_triple_against_nell(g, tr)

        if check["exists_exact"]:
            # An exact hit comes back with no similar triples, so it has to
            # be reported before the "similar" and "missing" branches.
            out.append("✓ Triplo existe (exato) em NELL")
        elif check["similar_triples"]:
            out.append("~ Triplo semelhante encontrado em NELL:")
            for st in check["similar_triples"][:3]:
                out.append(f"    {st['s']} --{st['r']}--> {st['o']} (scores={st['scores']})")
        else:
            out.append("✗ Triplo não existe em NELL")
            out.append("  Tentando verificar relação funcional...")

            found = []
            # A relation can only link two nodes that are in the graph.
            if subj_node is not None and obj_node is not None:
                embed_scores = embedding_relation_similarity(rel, g)
                top_relations = [s["kg_relation"] for s in embed_scores[:5]]
                found = relation(g, subj_node, obj_node, top_relations)

            if found:
                for found_rel, direction in found:
                    out.append(f"  ✓ Relação '{found_rel}' existe ({direction})")
            else:
                out.append("  Nenhuma relação candidata encontrada. Vamos ver no groq:")

                # Fallback: ask the LLM about the statement as the user wrote it.
                question = f"A relação '{subj} {rel} {obj}' é verdadeira? Sê breve, responde em pt pt, em menos de 50 palavras."
                try:
                    chat = client.chat.completions.create(
                        model="llama-3.3-70b-versatile",
                        messages=[{"role": "user", "content": question}]
                    )
                    out.append("  Groq → " + chat.choices[0].message.content)
                except Exception as exc:
                    # Best effort: report the failure and carry on with the
                    # remaining triples instead of hiding the cause.
                    out.append(f"  Groq → erro ao contactar o serviço: {exc}")

        output_lines.append("\n".join(out))

    return "\n".join(output_lines)


# Interactive loop: read a sentence, run the NLP pipeline and print the
# verification report, until the user asks to leave.
print("==============================================")
print("     NELL FACT-CHECKING CHATBOT     ")
print("==============================================")
print(" Escreve qualquer frase para analisar.")
print(" Escreve 'exit' para sair.")
print("----------------------------------------------")

while True:
    try:
        user_input = input("\n> ")
    except (EOFError, KeyboardInterrupt):
        # End of input or Ctrl-C: leave cleanly instead of with a traceback.
        print("\nA sair...")
        break

    # Surrounding whitespace must not prevent the exit command from matching.
    command = user_input.strip().lower()

    if command in {"exit", "quit"}:
        print("A sair...")
        break

    if not command:
        # Nothing to analyse; ask again without running the models.
        continue

    print("\nA analisar...\n")

    result = getnlp(user_input)

    print("Entidades Detetadas:", result["entities"])
    print("\nRelações Extraídas pelo REBEL:", result["relations"])

    if not result["relations"]:
        print("\nNenhuma relação encontrada.")
        continue

    print("\nVerificação das relações com NELL:")
    print(relations(result["relations"]))
    print("----------------------------------------------")
    print(" Escreve qualquer frase para analisar.")
    print(" Escreve 'exit' para sair.")
    print("----------------------------------------------")

2025-12-06 19:06:27,650 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
     NELL FACT-CHECKING CHATBOT     
 Escreve qualquer frase para analisar.
 Escreve 'exit' para sair.
----------------------------------------------

A analisar...


===== RAW REBEL OUTPUT =====
 Elton Brand  basketball  sport  Ronaldo  basketball  sport

Entidades Detetadas: [{'text': 'Elton Brand', 'type': 'PER'}, {'text': 'Ronaldo', 'type': 'PER'}]

Relações Extraídas pelo REBEL: [{'subject': 'Elton Brand', 'relation': 'sport', 'object': 'basketball', 'subject_id': None, 'object_id': None}, {'subject': 'Ronaldo', 'relation': 'sport', 'object': 'basketball', 'subject_id': None, 'object_id': None}]

Verificação das relações com NELL:

REBEL: [Elton Brand] --(sport)--> [basketball]
~ Sujeito aproximado: concept_athlete_elton_brand (score=100.0)
~ Objeto aproximado: concept