
# 02 — Ingest PDF → Turtle → AuraDB (v7: OpenAI for Step 4b)

This version replaces the 4b classifier with **OpenAI embeddings** nearest-neighbor mapping.
Set `OPENAI_API_KEY` in your env before running.


## 0) Setup

In [11]:
!pip install -U openai PyMuPDF pandas numpy tqdm transformers sentencepiece rdflib neo4j joblib spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## 1) Config

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from pathlib import Path
import os
from google.colab import files

uploaded = files.upload()  # pick a PDF from your computer
pdf_name = next(iter(uploaded))         # first selected file
PDF_PATH = Path(pdf_name)
print("Using:", PDF_PATH.resolve())

MODEL_DIR_SIMPLIFIER   = Path("./drive/MyDrive/MGR/trained_models/t5_simplifier_step2")
MODEL_DIR_TRIPLE       = Path("./drive/MyDrive/MGR/trained_models/t5_triple_step3")
MODEL_DIR_REL_CLASSIF  = Path("./drive/MyDrive/MGR/trained_models/distilbert_relation_step4a")

# OpenAI 4b artifacts
OPENAI_4B_DIR = Path("./drive/MyDrive/MGR/trained_models/openai_class_mapper_4b")
TAXO_EMB_NPY  = OPENAI_4B_DIR / "taxonomy_embeddings.npy"
TAXO_META_JSON= OPENAI_4B_DIR / "taxonomy_meta.json"

LABELMAP_DIR = Path("./drive/MyDrive/MGR/trained_models/label_maps")
REL_LABEL2ID_PATH = LABELMAP_DIR / "relation_label2id.joblib"

TAXONOMY_CSV = Path("./data/taxonomy.csv")  # for URIs

NEO4J_URI  = os.getenv("NEO4J_URI",  "neo4j+s://examole.databases.neo4j.io")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASS = os.getenv("NEO4J_PASS", "example")

BASE_NS = "http://example.org/telecom#"
PROV_NS = "http://www.w3.org/ns/prov#"

OPENAI_API_KEY="example"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

Saving ts_138300v180600p.pdf to ts_138300v180600p (3).pdf
Using: /content/ts_138300v180600p (3).pdf


## 2) Extract Telecom-Heavy Sentences from PDF

In [14]:
import fitz  # PyMuPDF
import spacy
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")

TELECOM_TRIGGERS = [
    "5g", "4g", "lte", "nr", "ran", "gNodeB", "ng-eNB", "amf", "smf", "upf", "ausf",
    "udm", "pcf", "nrf", "bsf", "nef", "network slice", "QoS", "qci", "s1ap",
    "ngap", "diameter", "sctp", "ims", "epc", "open5gs", "core network",
    "pdu session", "mme", "sgw", "pgw", "slice", "mec", "edge", "sba", "n2", "n3", "n6", "n8", "n10", "n11"
]

def extract_sentences_with_pages(pdf_path):
    doc = fitz.open(pdf_path)
    sentences = []
    for i in range(len(doc)):
        page = doc[i]
        text = page.get_text("text")
        for sent in nlp(text).sents:
            s = sent.text.strip()
            if s:
                sentences.append((i+1, s))
    return sentences

def is_telecom_heavy(text):
    low = text.lower()
    return any(tok.lower() in low for tok in TELECOM_TRIGGERS)

candidates = [(p, s) for (p, s) in extract_sentences_with_pages(PDF_PATH) if is_telecom_heavy(s)]
print(f"Extracted {len(candidates)} candidate sentences.")
print(candidates[:5])

Extracted 1985 candidate sentences.
[(1, 'ETSI TS 138 300 V18.6.0 (2025-07) \n5G; \nNR; \nNR and NG-RAN Overall description; \nStage-2  \n(3GPP TS 38.300 version 18.6.0'), (2, 'Release 18\n \nReference \nRTS/TSGR-0238300vi60 \nKeywords \n5G \nETSI \n650 Route des Lucioles \nF-06921 Sophia Antipolis Cedex - FRANCE \n \nTel.:'), (2, 'Fax: +33 4 93 65 47 16 \n \nSiret N° 348 623 562 00017 - APE 7112B \nAssociation à but non lucratif enregistrée à la \nSous-Préfecture de Grasse (06) N° w061004871 \n \nImportant notice'), (2, 'If you find errors in the present document, please send your comments to \nthe relevant service listed under Committee Support Staff.'), (2, 'No recommendation as to products and services or vendors is made or should be implied.')]


## 3) Load Trained Models (2,3,4a) & Taxonomy Embeddings (OpenAI 4b)

In [15]:
from transformers import T5ForConditionalGeneration, T5TokenizerFast, AutoTokenizer, AutoModelForSequenceClassification
import torch, joblib, json, numpy as np, pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# T5 (2 & 3)
t5_tok = T5TokenizerFast.from_pretrained(MODEL_DIR_SIMPLIFIER)
simplifier = T5ForConditionalGeneration.from_pretrained(MODEL_DIR_SIMPLIFIER).to(device)

triple_tok = T5TokenizerFast.from_pretrained(MODEL_DIR_TRIPLE)
triple_model = T5ForConditionalGeneration.from_pretrained(MODEL_DIR_TRIPLE).to(device)

# 4a classifier
rel_tok = AutoTokenizer.from_pretrained(MODEL_DIR_REL_CLASSIF)
rel_model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR_REL_CLASSIF).to(device)

# Taxonomy info
taxonomy_df = pd.read_csv(TAXONOMY_CSV)
taxo_uri = {row["node_id"]: row["uri"] for _, row in taxonomy_df.iterrows()}
taxo_label = {row["node_id"]: row["label"] for _, row in taxonomy_df.iterrows()}

# Embedding matrix + meta
mat = np.load(TAXO_EMB_NPY)
with open(TAXO_META_JSON, "r", encoding="utf-8") as f:
    meta = json.load(f)
node_ids = meta["node_ids"]
texts = meta["texts"]
assert mat.shape[0] == len(node_ids), "Embedding matrix vs node_ids mismatch"

## 4) Helpers

In [16]:
from typing import List
from openai import OpenAI
import numpy as np
import torch

client = OpenAI()

def t5_generate(model, tokenizer, inputs: List[str], max_len=128, prefix=None):
    model.eval()
    outs = []
    for text in inputs:
        if prefix:
            text = prefix + text
        enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            gen = model.generate(**enc, max_length=max_len, num_beams=4)
        outs.append(tokenizer.decode(gen[0], skip_special_tokens=True).strip())
    return outs

def classify_relations(model, tokenizer, texts: List[str]) -> List[str]:
    model.eval()
    labels = []
    for text in texts:
        enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=64)
        with torch.no_grad():
            logits = model(**enc).logits
        pred_id = int(torch.argmax(logits, dim=-1).cpu().item())
        labels.append(model.config.id2label.get(pred_id, str(pred_id)))
    return labels

def embed_openai(texts: List[str], model="text-embedding-3-large"):
    # returns [n, d] numpy array
    vecs = []
    for i in range(0, len(texts), 256):
        batch = texts[i:i+256]
        emb = client.embeddings.create(model=model, input=batch)
        vecs.extend([e.embedding for e in emb.data])
    return np.array(vecs, dtype="float32")

def cosine_sim(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    a = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-9)
    b = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-9)
    return a @ b.T  # [n, m]

def map_to_taxonomy(objects: List[str], top_k=5, threshold=0.75):
    obj_vecs = embed_openai(objects)
    sims = cosine_sim(obj_vecs, mat)  # [len(objects), N_nodes]
    top_ids = sims.argsort(axis=1)[:, ::-1][:, :top_k]
    top_scores = np.take_along_axis(sims, top_ids, axis=1)
    results = []
    for i, s in enumerate(objects):
        idxs = top_ids[i]
        scores = top_scores[i]
        nid = node_ids[idxs[0]]
        results.append((s, nid, float(scores[0])))
    return results

## 5) Steps 2 & 3 — Simplify → Extract Triples

In [17]:
# Fast batched version of Steps 2 & 3

import torch
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
simplifier.to(device).eval()
triple_model.to(device).eval()

def t5_generate_batched(model, tokenizer, texts, *, max_len=128, batch_size=16, num_beams=1):
    outs = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating", total=(len(texts)+batch_size-1)//batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(batch, return_tensors="pt", truncation=True, max_length=512, padding=True)
        enc = {k: v.to(device) for k, v in enc.items()}
        with torch.inference_mode():
            if device.type == "cuda":
                with torch.autocast(device_type="cuda", dtype=torch.float16):
                    gen = model.generate(**enc, max_length=max_len, num_beams=num_beams)
            else:
                gen = model.generate(**enc, max_length=max_len, num_beams=num_beams)
        outs.extend(tokenizer.batch_decode(gen, skip_special_tokens=True))
    return [o.strip() for o in outs]

# (Optional) thin out candidates to test speed first
# candidates = candidates[:200]

pages, texts0 = zip(*candidates) if candidates else ([], [])
print(f"Candidates: {len(texts0)}, device: {device}")

# Step 2 (complex -> simple): shorter outputs, no beam search
simple_sentences = t5_generate_batched(
    simplifier, t5_tok, list(texts0),
    max_len=96, batch_size=16, num_beams=1
)

# Step 3 (simple -> triple): keep short outputs, no beam search
triple_strings = t5_generate_batched(
    triple_model, triple_tok, list(simple_sentences),
    max_len=48, batch_size=16, num_beams=1
)

def parse_triple(s):
    parts = [p.strip() for p in s.split("|")]
    if len(parts) != 3: return None, None, None
    return parts[0], parts[1], parts[2]

triples_raw = []
for p, t in zip(pages, triple_strings):
    a, r, b = parse_triple(t)
    if a and r and b:
        triples_raw.append((p, a, r, b))

print("Sample triples:", triples_raw[:5])
print(f"Total triples parsed: {len(triples_raw)}")

Candidates: 1985, device: cpu


Generating: 100%|██████████| 125/125 [10:43<00:00,  5.15s/it]
Generating: 100%|██████████| 125/125 [06:15<00:00,  3.00s/it]

Sample triples: [(2, 'ETSI', 'shallNotBeHonoreInAnyEventFor', 'AnyDaunt whatsoever'), (2, 'CopyrightNotificationNoParts', 'mayBeReproducedAndUsedIn', 'AnyForm or By Any Means'), (3, 'ETSISR 000314Updates', 'are', 'EssentialToThe PresentDokument'), (3, 'PLUGTESTSTM', 'is', 'trademarkOfETSIregisteredForTheTunnelOfSeconds'), (3, '3GPPTMLogo', 'isRegistratedFor', 'BenefitsOfSeconds')]
Total triples parsed: 1543





## 6) Step 4a — Normalize Relations (DistilBERT)

In [18]:
raw_relations = [r for _, _, r, _ in triples_raw]
norm_rel = classify_relations(rel_model, rel_tok, raw_relations)

def rel_to_iri(label: str) -> str:
    safe = "".join(ch if ch.isalnum() else "_" for ch in label).strip("_")
    return f"{BASE_NS}{safe}"
rel_iris = [rel_to_iri(x) for x in norm_rel]

## 7) Step 4b — Map Classes via OpenAI Embeddings

In [19]:
# --- Robust 4b mapping with top-k ranking, path boosting, thresholding, and optional GPT tie-break ---
import re
import numpy as np
import pandas as pd

TOP_K = 5
HIGH_THRESH = 0.78       # accept confidently
LOW_THRESH  = 0.70       # below this, still take best but mark low_confidence
USE_GPT_FOR_TIEBREAK = True  # requires OPENAI_API_KEY and internet access

# Build a label lookup for leaf text (strip path) to node_id for light heuristics
leaf_text_by_id = {nid: txt.split("⟂")[0].strip() if "⟂" in txt else txt for nid, txt in zip(node_ids, texts)}

def rank_taxonomy(objects, top_k=TOP_K):
    # Use the helpers from section 4 (embed_openai, cosine_sim, 'mat', 'node_ids', 'texts')
    vecs = embed_openai(objects)
    sims = cosine_sim(vecs, mat)
    top_ids = sims.argsort(axis=1)[:, ::-1][:, :top_k]
    top_scores = np.take_along_axis(sims, top_ids, axis=1)
    return top_ids, top_scores  # indices into node_ids

def boost_with_path(object_str, node_text):
    # Small boost if object tokens appear in the candidate's path text
    s = object_str.lower()
    t = node_text.lower()
    score = 0.0
    # Exact leaf match gets a bigger boost
    leaf = node_text.split("⟂")[0].strip().lower() if "⟂" in node_text else node_text.lower()
    if s == leaf:
        score += 0.06
    # Token overlap
    toks = [w for w in re.findall(r"[a-zA-Z0-9]+", s) if len(w) > 2]
    overlap = sum(1 for w in toks if w in t)
    if overlap:
        score += min(0.04, 0.01 * overlap)
    return score

def pick_best(object_str, cand_node_idxs, cand_scores):
    # Apply a tiny path-based boost before selecting
    adj_scores = []
    for idx, base in zip(cand_node_idxs, cand_scores):
        nid  = node_ids[idx]
        text = texts[idx]
        adj = base + boost_with_path(object_str, text)
        adj_scores.append(adj)
    # Return best by adjusted score
    best_pos = int(np.argmax(adj_scores))
    return cand_node_idxs[best_pos], float(adj_scores[best_pos])

def gpt_tiebreak(object_str, cand_node_idxs):
    # Ask GPT to choose the best among top-3 when confidence is marginal
    if not USE_GPT_FOR_TIEBREAK:
        return cand_node_idxs[0]
    try:
        options = [
            {"id": node_ids[i], "label": taxo_label.get(node_ids[i], node_ids[i]), "path": texts[i]}
            for i in cand_node_idxs[:3]
        ]
        prompt = (
            "Select the single best taxonomy class for the object.\n"
            f"Object: {object_str}\n\n"
            "Candidates:\n" +
            "\n".join([f"- id={o['id']} | label={o['label']} | path={o['path']}" for o in options]) +
            "\n\nReturn only the id of the best candidate."
        )
        rsp = client.responses.create(
            model="gpt-4o-mini",
            input=[{"role": "user", "content": prompt}]
        )
        txt = rsp.output_text.strip()
        # Extract the id (must match one of the candidates)
        for o in options:
            if o["id"] in txt:
                return o["id"]
        # If not found, fallback to the first
        return options[0]["id"]
    except Exception as e:
        # If API call fails, silently fallback to first
        return node_ids[cand_node_idxs[0]]

def map_objects(objects):
    top_ids, top_scores = rank_taxonomy(objects, top_k=TOP_K)
    picks = []
    for obj, idxs, scores in zip(objects, top_ids, top_scores):
        best_idx, best_score = pick_best(obj, idxs, scores)
        nid = node_ids[best_idx]
        conf = "high" if best_score >= HIGH_THRESH else ("mid" if best_score >= LOW_THRESH else "low")
        # Tie-break with GPT for mid-confidence
        if conf == "mid":
            chosen_id = gpt_tiebreak(obj, idxs)
            if chosen_id != nid:
                # If GPT picked a different id, recompute score from our sims array
                chosen_pos = list(idxs).index(node_ids.index(chosen_id)) if chosen_id in node_ids else 0
                nid = chosen_id
        picks.append((obj, nid, float(best_score), conf))
    return picks

# Run mapping
raw_a = [a for _, a, _, _ in triples_raw]
raw_b = [b for _, _, _, b in triples_raw]

mapped_a = map_objects(raw_a)
mapped_b = map_objects(raw_b)

# Build URIs and a small debug frame
def to_uri(node_id: str) -> str:
    return taxo_uri.get(node_id, f"{BASE_NS}{node_id}")

a_uris = [to_uri(nid) for (_, nid, score, conf) in mapped_a]
b_uris = [to_uri(nid) for (_, nid, score, conf) in mapped_b]

normalized = [(p, sa, pr, ob) for (p, sa, pr, ob) in zip([t[0] for t in triples_raw], a_uris, rel_iris, b_uris)]
print("Sample normalized triples:", normalized[:5])

# Write a debug report to inspect mapping quality
dbg_a = pd.DataFrame(mapped_a, columns=["object_text","node_id","score","confidence"]).assign(role="subject")
dbg_b = pd.DataFrame(mapped_b, columns=["object_text","node_id","score","confidence"]).assign(role="object")
dbg = pd.concat([dbg_a, dbg_b], ignore_index=True)
dbg["label"] = dbg["node_id"].map(lambda nid: taxo_label.get(nid, nid))
dbg.to_csv("debug_class_mapping.csv", index=False)
print("Wrote mapping debug report to debug_class_mapping.csv")

Sample normalized triples: [(2, 'http://example.org/telecom#EAP5G', 'http://example.org/telecom#isusedfor', 'http://example.org/telecom#AV'), (2, 'http://example.org/telecom#Privacy', 'http://example.org/telecom#isusedin', 'http://example.org/telecom#FormOfNon3GPPAccess'), (3, 'http://example.org/telecom#Release19TR28914', 'http://example.org/telecom#are', 'http://example.org/telecom#EssentialInPrivate5GEnvironments'), (3, 'http://example.org/telecom#Testbed', 'http://example.org/telecom#is', 'http://example.org/telecom#IPsecTunnelingForNon3GPPAccessPaths'), (3, 'http://example.org/telecom#3GPPTS29244', 'http://example.org/telecom#isusedfor', 'http://example.org/telecom#MetricsEndpointsEvery10Seconds')]
Wrote mapping debug report to debug_class_mapping.csv


## 8) Build Turtle & Load into AuraDB

In [22]:
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from neo4j import GraphDatabase
from rdflib import URIRef as RDFURIRef
from tqdm import tqdm
from pathlib import Path

g = Graph()
BASE = Namespace(BASE_NS)
PROV = Namespace(PROV_NS)
g.bind("base", BASE)
g.bind("prov", PROV)

# GOOD: percent-encodes spaces, parentheses, etc.
source_uri = URIRef(Path(PDF_PATH).resolve().as_uri())

for page, s_uri, p_iri, o_uri in normalized:
    s = URIRef(s_uri); p = URIRef(p_iri); o = URIRef(o_uri)
    g.add((s, p, o))
    bn = BNode()
    g.add((bn, PROV.wasDerivedFrom, source_uri))
    g.add((bn, PROV.value, Literal(int(page))))
    g.add((bn, PROV.wasGeneratedBy, URIRef(p_iri)))

TTL_PATH = Path("./output.ttl")
g.serialize(destination=str(TTL_PATH), format="turtle")
print("Wrote:", TTL_PATH.resolve())

def try_n10s_import(uri, user, password, ttl_text):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    with driver.session() as sess:
        try:
            _ = sess.run("CALL n10s.graphconfig.show()").data()
            res = sess.run("CALL n10s.rdf.import.inline($ttl, 'Turtle') YIELD terminationStatus, triplesLoaded, triplesParsed",
                           ttl=ttl_text).data()
            return True, res
        except Exception as e:
            return False, str(e)
        finally:
            driver.close()

def sanitize_rel_type(iri_tail: str) -> str:
    out = "".join(ch if ch.isalnum() else "_" for ch in iri_tail)
    if not out: out = "RELATED_TO"
    if out[0].isdigit(): out = "_" + out
    return out.upper()

def iri_tail(iri: str) -> str:
    for sep in ["#", "/", ":"]:
        if sep in iri:
            iri = iri.rsplit(sep, 1)[-1]
    return iri

def fallback_merge(uri, user, password, ttl_path):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    gg = Graph(); gg.parse(ttl_path, format="turtle")
    triples = [(str(s), str(p), str(o)) for (s, p, o) in gg if isinstance(s, RDFURIRef) and isinstance(o, RDFURIRef)]
    with driver.session() as sess:
        sess.run("CREATE CONSTRAINT IF NOT EXISTS FOR (n:Resource) REQUIRE n.uri IS UNIQUE")
        for s, p, o in tqdm(triples):
            rel_type = sanitize_rel_type(iri_tail(p))
            q = "MERGE (s:Resource {uri:$suri})\n" +                 "MERGE (o:Resource {uri:$ouri})\n" +                 f"MERGE (s)-[r:{rel_type}]->(o)"
            sess.run(q, suri=s, ouri=o)
    driver.close()

ttl_text = Path(TTL_PATH).read_text(encoding="utf-8")
ok, info = try_n10s_import(NEO4J_URI, NEO4J_USER, NEO4J_PASS, ttl_text)
if ok:
    print("n10s import finished:", info)
else:
    print("n10s not available or failed; falling back. Details:", str(info)[:800])
    fallback_merge(NEO4J_URI, NEO4J_USER, NEO4J_PASS, TTL_PATH)
    print("Fallback MERGE completed.")

Wrote: /content/output.ttl
n10s not available or failed; falling back. Details: {code: Neo.ClientError.Procedure.ProcedureNotFound} {message: There is no procedure with the name `n10s.graphconfig.show` registered for this database instance. Please ensure you've spelled the procedure name correctly and that the procedure is properly deployed.}


100%|██████████| 1446/1446 [03:40<00:00,  6.56it/s]

Fallback MERGE completed.



