# Mini Materials Knowledge Graph — Common Semiconductors
a tiny pipeline that turns a small semiconductors table into **RDF triples**, then I query it using **SPARQL**. Later, down the process I plug in an NLP Algorithm to parse and query from natural language.


## Some basic terminologies
- **Ontology** → my small schema + vocabulary for this domain (classes + relations).
- **RDF (Resource Description Framework) triple** → one fact written as `subject — predicate — object`.
- **Namespace** → URL prefix so my identifiers are unique.
- **SPARQL** → my query tool for RDF graphs (like SQL but for triples).
- **Turtle (.ttl)** → compact text format to store RDF.
- **IRI(Internationalized Resource Identifier)** → a **Unicode string** that serves as a unique, global name or identifier for an entity.
- **Regex (Regular Expression):** -> A specialized mini-language for describing text by using combination of 'Literal' & 'Meta' characters & allowing  find, match, replace, and validate strings of text by specifying rules for character combinations. Literal characters: These match themselves exactly ("cat" would match the literal string "cat"). Metacharacters: special characters having a specific meaning within the regex pattern, such as:
(dot '.'): Matches any single character. 
(asterisk '*'): Matches the preceding character zero or more times. 
(plus '+'): Matches the preceding character one or more times. 
(question mark '?'): Matches the preceding character zero or one time. 
(pipe '|'): Acts as an "OR" operator, allowing you to match one pattern or another. 
\d: Matches a digit (0-9). 

**Imports**

In [None]:
# import os; print("OK" if os.getenv("OPENAI_API_KEY") else "MISSING")

OK


In [1]:
import pandas as pd
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD
from pathlib import Path
import re
from dotenv import load_dotenv
load_dotenv()
import matplotlib.pyplot as plt
import json, ast
from matminer.featurizers.structure import GlobalSymmetryFeatures
from pymatgen.core import Structure
import openpyxl
from graphviz import Digraph

# Dataset prep. for KG   
The dataset I am using is parsed from Materials Project database, where 'Structure' is pymatgen’s pretty-printed “Structure summary”, not a standard format (CIF/POSCAR/JSON). Therefore 'Regex' helpers was used to decode that & subsequently a Structure object is created. The structure object was created to use 'Matminer's Structure featurizer, from which I am interested to extract the 'crystal_system' & 'Whether the structure has an inversion center'.

In [None]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
from pymatgen.core import Structure, Lattice
from matminer.featurizers.structure import GlobalSymmetryFeatures

# -------- Paths ----------
DATA_DIR   = Path("../data").resolve()
INPUT_XLSX = DATA_DIR / "full_dataset_Bandgap_0_to_5.xlsx"
OUTPUT_CSV = DATA_DIR / "full_dataset_Bandgap_0_to_5_featurized.csv"

# -------- Regex helpers for the pretty string  (see regex documentation)----------
re_abc     = re.compile(r"abc\s*:\s*([-\d\.Ee+]+)\s+([-\d\.Ee+]+)\s+([-\d\.Ee+]+)", re.I)

re_angles  = re.compile(r"angles\s*:\s*([-\d\.Ee+]+)\s+([-\d\.Ee+]+)\s+([-\d\.Ee+]+)", re.I)

re_pbc     = re.compile(r"pbc\s*:\s*(True|False)\s+(True|False)\s+(True|False)", re.I)

re_sites_header = re.compile(r"Sites\s*\(\d+\)", re.I)

re_site_row = re.compile(
    r"^\s*\d+\s+([A-Za-z][a-z]?)\s+([-\d\.Ee+]+)\s+([-\d\.Ee+]+)\s+([-\d\.Ee+]+)",
    re.I
)

def parse_pretty_structure(txt):
    """Parse pymatgen's pretty-printed Structure summary into a Structure."""
    if not isinstance(txt, str):
        return None
    s = txt.strip()

    # Lattice params
    m_abc = re_abc.search(s)
    m_ang = re_angles.search(s)
    if not (m_abc and m_ang):
        return None
    a, b, c = map(float, m_abc.groups())
    alpha, beta, gamma = map(float, m_ang.groups())

    # PBC (optional; default True,True,True)
    m_pbc = re_pbc.search(s)
    pbc = tuple(map(lambda x: x.lower()=="true", m_pbc.groups())) if m_pbc else (True, True, True)

    # Sites block: find header, then parse subsequent lines
    lines = s.splitlines()
    try:
        start_idx = next(i for i, ln in enumerate(lines) if re_sites_header.search(ln))
    except StopIteration:
        return None

    species, frac_coords = [], []
    for ln in lines[start_idx+1:]:
        m = re_site_row.match(ln)
        if not m:
            continue
        sp, fa, fb, fc = m.groups()
        species.append(sp)
        frac_coords.append([float(fa), float(fb), float(fc)])

    if not species:
        return None

    # Build lattice & structure
    lat = Lattice.from_parameters(a=a, b=b, c=c, alpha=alpha, beta=beta, gamma=gamma)
    try:
        # pymatgen Structure ignores PBC per-axis in initializer; periodicity is assumed.
        struct = Structure(lattice=lat, species=species, coords=frac_coords, coords_are_cartesian=False)
        return struct
    except Exception:
        return None

# -------- Load Excel ----------
df = pd.read_excel(INPUT_XLSX)

# Choose/confirm structure column 
struct_col = "structure" if "structure" in df.columns else None
if struct_col is None:
    # try a best-guess search for a column that contains the pretty summary
    candidates = [c for c in df.columns if df[c].astype(str).str.contains(r"Full Formula|Reduced Formula|Sites \(", regex=True, na=False).any()]
    struct_col = candidates[0] if candidates else None

if struct_col is None:
    raise ValueError(f"Could not find the structure column. Available columns: {list(df.columns)}")

# -------- Parse structures --------
parsed = df[struct_col].apply(parse_pretty_structure)

parsed_ok = parsed.apply(lambda x: isinstance(x, Structure))
print(f"Parsed {parsed_ok.sum()} / {len(parsed)} structures ({100*parsed_ok.mean():.1f}%).")

if parsed_ok.sum() == 0:
    raise RuntimeError("Parser could not reconstruct any structures from the pretty string format. "
                       "Please share one exact cell (as plain text) or consider storing CIF/JSON for structures.")

# Replace column with parsed Structure objects
df[struct_col] = parsed

# -------- Featurize (two features only) --------
gsf = GlobalSymmetryFeatures()
labels = gsf.feature_labels()  # ['spacegroup_num','crystal_system','crystal_system_int','is_centrosymmetric','n_symmetry_ops']

records = []
for s in df[struct_col]:
    if isinstance(s, Structure):
        try:
            vals = gsf.featurize(s)
            rec = dict(zip(labels, vals))
        except Exception:
            rec = {lbl: np.nan for lbl in labels}
    else:
        rec = {lbl: np.nan for lbl in labels}
    records.append(rec)

feat_df = pd.DataFrame(records)
out_df = pd.concat([df.reset_index(drop=True),
                    feat_df[["crystal_system", "is_centrosymmetric"]].reset_index(drop=True)], axis=1)

# -------- Save --------
out_df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved: {OUTPUT_CSV}")
print(out_df[["crystal_system", "is_centrosymmetric"]].isna().mean())
out_df.head()

Parsed 150987 / 150987 structures (100.0%).
Saved: E:\Projects\Semantic_models_for-MSE\data\full_dataset_Bandgap_0_to_5_featurized.csv
crystal_system        0.0
is_centrosymmetric    0.0
dtype: float64


Unnamed: 0,material_id,formula,band_gap,structure,crystal_system,is_centrosymmetric
0,mp-10018,Ac,0.0,[[0. 0. 0.] Ac],cubic,True
1,mp-1183057,Ac,0.0,[[1.31096178e+00 2.27065255e+00 3.21093059e-16...,trigonal,False
2,mp-1183069,Ac,0.0,"[[0. 0. 0.] Ac, [2.65892229 0.77478234 2.89079...",trigonal,True
3,mp-862690,Ac,0.0,"[[0. 0. 0.] Ac, [-3.26165376e-07 2.33598734e+...",hexagonal,True
4,mp-861724,Ac2AgIr,0.0,"[[3.10657746 2.19668199 5.38075 ] Ac, [0. 0....",cubic,True


## 1) Setup
read a tiny CSV and map it into RDF using `rdflib`.


In [2]:
DATA    = Path("../data/full_dataset_Bandgap_0_to_5_featurized.csv")  # input CSV
TTL_OUT = Path("../data/full_dataset_Bandgap_0_to_5.ttl")             # RDF Turtle output

print("Using data:", DATA.resolve())
print("Will write TTL:", TTL_OUT.resolve())

Using data: E:\Projects\Semantic_models_for-MSE\data\full_dataset_Bandgap_0_to_5_featurized.csv
Will write TTL: E:\Projects\Semantic_models_for-MSE\data\full_dataset_Bandgap_0_to_5.ttl



## 2) Ontology skeleton 
a few classes and properties that I actually need for the initial CSV i am using.  
Classes: `Material`, `SynthesisMethod`, `CrystalStructure`, `Property`  
Properties:  
- data: `hasBandGap` (float eV), `hasLatticeConstant` (float Å)  
- object: `hasCrystalStructure`, `synthesizedBy`

- the final dataset will have: structure (crystal system and inversion center), composition and bandgap.  

3 class declarations (Material, CrystalStructure, Property) → 3  
4 property declarations (hasBandGap, hasFormula, hasExternalId, hasCrystalStructure) → 5  
5×(domain + range) annotations → 10 for each prperty    
Total = 3 + 5 + 10 = 18 triples.

a. Core structure

In [3]:
# Ontology skeleton for: material_id, formula, band_gap, crystal_system, is_centrosymmetric
# (structure captured via crystal system + inversion center only)

from rdflib import Graph, Namespace, RDF, RDFS, XSD

g = Graph()

# Namespaces
EX = Namespace("http://example.org/mse#")
g.bind("ex", EX)
g.bind("rdfs", RDFS)
g.bind("xsd", XSD)

# Classes
Material         = EX.Material
CrystalStructure = EX.CrystalStructure
Property         = EX.Property  # generic placeholder

for cls in [Material, CrystalStructure, Property]:
    g.add((cls, RDF.type, RDFS.Class))

# Datatype properties (aligned to featurized CSV)
hasExternalId     = EX.hasExternalId        # -> material_id
hasFormula        = EX.hasFormula           # -> formula (composition string)
hasBandGap        = EX.hasBandGap           # -> band_gap (eV)
hasCrystalSystem  = EX.hasCrystalSystem     # -> crystal_system (e.g., cubic, hexagonal)
hasCentrosymmetric= EX.hasCentrosymmetric   # -> is_centrosymmetric (True/False)

for prop in [hasExternalId, hasFormula, hasBandGap, hasCrystalSystem, hasCentrosymmetric]:
    g.add((prop, RDF.type, RDF.Property))

# Domain/Range annotations
g.add((hasExternalId,      RDFS.domain, Material)); g.add((hasExternalId,      RDFS.range, XSD.string))
g.add((hasFormula,         RDFS.domain, Material)); g.add((hasFormula,         RDFS.range, XSD.string))
g.add((hasBandGap,         RDFS.domain, Material)); g.add((hasBandGap,         RDFS.range, XSD.float))
g.add((hasCrystalSystem,   RDFS.domain, Material)); g.add((hasCrystalSystem,   RDFS.range, XSD.string))
g.add((hasCentrosymmetric, RDFS.domain, Material)); g.add((hasCentrosymmetric, RDFS.range, XSD.boolean))

print("Ontology initialized (structure via crystal_system + inversion center, plus composition & bandgap).")
print("Triples so far:", len(g))

Ontology initialized (structure via crystal_system + inversion center, plus composition & bandgap).
Triples so far: 18


b. Ingest metadata

In [4]:
from rdflib import Literal

ingestIndex = EX.ingestIndex        # integer sequence number of ingest
ingestTime  = EX.ingestTime         # xsd:dateTime when ingested

for prop, rng, comment in [
    (ingestIndex, XSD.integer,  "Monotonic ingest sequence number for this material record."),
    (ingestTime,  XSD.dateTime, "Timestamp when this record was ingested into the KG (ISO 8601)."),
]:
    if (prop, RDF.type, RDF.Property) not in g:
        g.add((prop, RDF.type,   RDF.Property))
        g.add((prop, RDFS.domain, EX.Material))
        g.add((prop, RDFS.range,  rng))
        
        # comment must be an RDF term (Literal), not a Python str
        g.add((prop, RDFS.comment, Literal(comment)))

print("Ingest metadata properties declared (idempotent).")
print("Triples so far:", len(g))


Ingest metadata properties declared (idempotent).
Triples so far: 26


c. Automatic Ingest_Index & Ingest_time

In [5]:
from rdflib import Literal
from rdflib.namespace import XSD
from datetime import datetime, timezone
try:
    # Py 3.11+: datetime.UTC exists
    from datetime import UTC
except ImportError:
    UTC = timezone.utc

def iso_now():
    # Always UTC, no microseconds, with 'Z' suffix
    return datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")

def next_ingest_index(graph):
    existing = []
    for s, p, o in graph.triples((None, EX.ingestIndex, None)):
        try:
            existing.append(int(o.toPython()))
        except Exception:
            pass
    return (max(existing) + 1) if existing else 1

def material_iri(material_id: str):
    safe_id = str(material_id).strip().replace(" ", "_").replace("/", "_")
    return EX[f"material/{safe_id}"]

def add_material_row(row, graph, ingest_idx=None, when_iso=None):
    m = material_iri(row["material_id"])
    graph.add((m, RDF.type, EX.Material))

    # Core fields
    graph.add((m, EX.hasExternalId,      Literal(str(row["material_id"]))))
    graph.add((m, EX.hasFormula,         Literal(str(row["formula"]))))
    graph.add((m, EX.hasBandGap,         Literal(float(row["band_gap"]))))
    graph.add((m, EX.hasCrystalSystem,   Literal(str(row["crystal_system"]))))
    graph.add((m, EX.hasCentrosymmetric, Literal(bool(row["is_centrosymmetric"]))))

    # Ingest metadata (auto)
    if ingest_idx is None:
        ingest_idx = next_ingest_index(graph)
    if when_iso is None:
        when_iso = iso_now()

    graph.add((m, EX.ingestIndex, Literal(int(ingest_idx), datatype=XSD.integer)))

    # Use set() to ensure a single value; avoids deprecated utcnow()
    graph.set((m, EX.ingestTime, Literal(when_iso, datatype=XSD.dateTime)))

    return m

## 3) Load CSV and mint entities  
create IRIs (Internationalized Resource Identifier) from labels (simple normalization) and assert triples for each row.

In [None]:
# Featurized CSV columns: material_id, formula, band_gap, crystal_system, is_centrosymmetric
import pandas as pd, re
from rdflib import Literal, RDF, RDFS, XSD, URIRef

# load
df_raw = pd.read_csv(DATA)

# rename to stable names (will be treated as gloabal)
df = df_raw.rename(columns={
    "material_id":        "material_id",
    "formula":            "formula",
    "band_gap":           "band_gap_eV",
    "crystal_system":     "crystal_system",
    "is_centrosymmetric": "is_centrosymmetric",
}).copy()



# enforcing dtypes 
for col in ["formula", "material_id", "crystal_system"]:
    if col in df.columns:
        df[col] = df[col].astype("string")

df["band_gap_eV"] = pd.to_numeric(df.get("band_gap_eV"), errors="coerce")

if "is_centrosymmetric" in df.columns:
    # normalize 
    df["is_centrosymmetric"] = df["is_centrosymmetric"].map(
        lambda x: bool(int(x)) if str(x).strip() in {"1","0"} else
                  (str(x).strip().lower() == "true") if pd.notna(x) else None
    )

# --- 2) Helpers (consistent across the notebook) ---
def _slugify(text: str) -> str:
    text = str(text).strip().replace(" ", "_").replace("(", "").replace(")", "").replace("/", "_")
    text = re.sub(r"[^A-Za-z0-9_]", "_", text)
    return text

def mint_entity(label, cls: URIRef, fallback_prefix: str, idx: int):
    if label is None or (pd.isna(label) if hasattr(pd, "isna") else label is None) or str(label).strip() == "":
        safe = f"{fallback_prefix}_{idx}"
        iri  = EX[safe]
        g.add((iri, RDF.type, cls))
        g.add((iri, RDFS.label, Literal(f"{fallback_prefix} #{idx}")))
        return iri
    label_str = str(label)
    safe = _slugify(label_str)
    iri  = EX[safe]
    g.add((iri, RDF.type, cls))
    g.add((iri, RDFS.label, Literal(label_str)))
    return iri

print("Data loaded. Rows:", len(df))
print("Columns:", list(df.columns))

Data loaded. Rows: 150987
Columns: ['material_id', 'formula', 'band_gap_eV', 'structure', 'crystal_system', 'is_centrosymmetric']


## 4) Serialize to Turtle
write the RDF graph to a `.ttl` file so it’s versionable in Git and easy to inspect.


In [8]:
# --- Serialize graph to Turtle ---encoding="utf-8" ensures rdflib returns bytes --

ttl_bytes = g.serialize(format="turtle", encoding="utf-8")
TTL_OUT.write_bytes(ttl_bytes)

# quick sanity check
p = TTL_OUT.resolve()
print("Wrote:", p)
print("Triples in graph:", len(g))
print("File size (bytes):", p.stat().st_size)

Wrote: E:\Projects\Semantic_models_for-MSE\data\full_dataset_Bandgap_0_to_5.ttl
Triples in graph: 26
File size (bytes): 1117


## 6) Creating safeguard for possible problems in data scrapping!!  
lets have a few small rules here to catch obvious issues (labels missing, negative band gaps, etc.).

In [9]:
problems = []

# A) All Materials should have labels
for s in g.subjects(RDF.type, EX.Material):
    if not any(True for _ in g.objects(s, RDFS.label)):
        problems.append(f"Material without label: {s}")

# B) Band gap must be numeric and non-negative
for s, p, o in g.triples((None, EX.hasBandGap, None)):
    try:
        val = float(o.toPython())
        if val < 0:
            problems.append(f"Negative band gap for {s}")
    except Exception:
        problems.append(f"Non-numeric band gap for {s}: {o}")

print("No obvious problems ✅" if not problems else "Consistency problems:")
for x in problems:
    print("-", x)

No obvious problems ✅



## 7) Placeholder for LLM‑assisted extraction
When I replace this stub with a real LLM/NLP call, I’ll feed abstracts/tables and get back candidate triples to add to the graph. This cell is just a demonstration of tideousness one has to encounter for manually adding to the KG :xD.


In [10]:
def propose_triples_from_text(text: str):
    """
    Demo placeholder:
    Pretend we parsed that 'GaN has Eg ~3.4 eV'.
    Later this can be swapped for NLP/LLM-based extraction.
    """
    return [(EX.GaN, EX.hasBandGap, Literal(3.4, datatype=XSD.float))]

# Insert demo triples
for s, p, o in propose_triples_from_text("GaN has band gap ~3.4 eV"):
    g.add((s, p, o))

print("Triples after stub insert:", len(g))

# --- Update the TTL file with new triples ---
TTL_OUT.write_bytes(g.serialize(format="turtle", encoding="utf-8"))

print("Updated:", TTL_OUT.resolve())
print("Triples now in graph:", len(g))

Triples after stub insert: 27
Updated: E:\Projects\Semantic_models_for-MSE\data\full_dataset_Bandgap_0_to_5.ttl
Triples now in graph: 27


## 9: Plotting a knowledge graph

- via `HTML`

In [None]:
# # Lightweight KG visualization (Material, CrystalStructure, bandgap, crystal_system, is_centrosymmetric)
# from pyvis.network import Network
# from rdflib import Graph, URIRef, BNode, Literal, RDF, RDFS

# def display_label(term):
#     """Prefer rdfs:label, then QName/localname, else str(term)."""
#     lab = g.value(term, RDFS.label)
#     if lab:
#         return str(lab).strip()
#     if isinstance(term, URIRef):
#         try:
#             return g.namespace_manager.normalizeUri(term)
#         except Exception:
#             pass
#         s = str(term)
#         return s.rsplit("#", 1)[-1].rsplit("/", 1)[-1]
#     return str(term).strip()

# def node_id(term):
#     """Stable ID for pyvis."""
#     if isinstance(term, (URIRef, BNode)):
#         return str(term)
#     return f"lit:{hash((str(term), type(term).__name__))}"

# def node_style(term):
#     """Color by class (Material / CrystalStructure); literals boxed."""
#     types = set(g.objects(term, RDF.type)) if isinstance(term, (URIRef, BNode)) else set()

#     def has_type(local):
#         return any(str(t).endswith(f"#{local}") or str(t).endswith(f"/{local}") for t in types)

#     if has_type("Material"):
#         return dict(color="#2b8a3e", shape="ellipse")
#     if has_type("CrystalStructure"):
#         return dict(color="#1c7ed6", shape="ellipse")
#     if isinstance(term, Literal):
#         return dict(color="#bfbfbf", shape="box")
#     return dict(color="#666666", shape="ellipse")

# def visualize_graph_pyvis(g, max_edges=2000, show_literals=True, height="700px", out_html="kg.html"):
#     net = Network(height=height, width="100%", directed=True, notebook=True, cdn_resources="in_line")
#     net.toggle_physics(True)

#     added = set()
#     edge_count = 0

#     for s, p, o in g.triples((None, None, None)):
#         if edge_count >= max_edges:
#             break

#         # Optionally skip literals
#         if not show_literals and isinstance(o, Literal):
#             continue

#         sid = node_id(s)
#         oid = node_id(o)

#         if sid not in added:
#             net.add_node(sid, label=display_label(s), **node_style(s))
#             added.add(sid)
#         if show_literals or not isinstance(o, Literal):
#             if oid not in added:
#                 net.add_node(oid, label=display_label(o), **node_style(o))
#                 added.add(oid)

#         # Shorten common predicate labels for readability
#         pred_label = display_label(p)
#         pred_label = pred_label.replace("ex:", "")
#         pred_label = pred_label.replace("hasCrystalSystem", "crystal_system")
#         pred_label = pred_label.replace("hasCentrosymmetric", "centrosymmetric")
#         pred_label = pred_label.replace("hasBandGap", "band_gap_eV")
#         pred_label = pred_label.replace("hasFormula", "formula")
#         pred_label = pred_label.replace("hasExternalId", "material_id")

#         net.add_edge(sid, oid, label=pred_label)
#         edge_count += 1

#     net.show(out_html)
#     print(f"Wrote: {out_html} (open in your browser)")

# visualize_graph_pyvis(g, max_edges=1000, show_literals=True, out_html="kg.html")

kg.html
Wrote: kg.html (open in your browser)


- Via `Graphviz`

In [None]:
# import os
# os.environ["PATH"] += os.pathsep + r"C:\Program Files\Graphviz\bin" 
# from graphviz import Digraph
# import hashlib

# def nlabel(t):
#     lab = g.value(t, RDFS.label)
#     if lab: return str(lab)
#     if isinstance(t, URIRef):
#         try: return g.namespace_manager.normalizeUri(t)
#         except: pass
#     return str(t)

# def nid(t):
#     return hashlib.md5(str(t).encode("utf-8")).hexdigest()  # safe ID for DOT

# def nshape(t):
#     return "box" if isinstance(t, Literal) else "ellipse"

# def ncolor(t):
#     types = set(g.objects(t, RDF.type)) if isinstance(t, URIRef) else set()
#     def has(local): return any(str(tt).endswith(f"#{local}") or str(tt).endswith(f"/{local}") for tt in types)
#     if has("Material"): return "#2b8a3e"
#     if has("CrystalStructure"): return "#1c7ed6"
#     return "#666666"

# dot = Digraph(engine="dot")  # or "sfdp" for large graphs
# dot.attr(rankdir="LR")

# seen = set()
# for s, p, o in g.triples((None, None, None)):
#     sid, oid = nid(s), nid(o)
#     if sid not in seen:
#         dot.node(sid, label=nlabel(s), shape=nshape(s), color=ncolor(s)); seen.add(sid)
#     if oid not in seen:
#         dot.node(oid, label=nlabel(o), shape=nshape(o), color=ncolor(o)); seen.add(oid)
#     dot.edge(sid, oid, label=nlabel(p).replace("ex:", ""))

# dot.render("kg_graphviz", format="png", cleanup=True)
# print("Wrote: kg_graphviz.png")

Wrote: kg_graphviz.png


# Utilizing `LLM`  in `KG pipeline`  
The KG is like a well-organized library: every material is a “book,” and the properties (band gap, crystal system, centrosymmetry, formula) are the “catalog cards.”  

The LLM is like a librarian who understands natural language: I can ask, “Which semiconductors are cubic and non-centrosymmetric with band gap > 2 eV?” and the LLM can translate that into graph queries or even propose new triples. Without the LLM, I’d have to speak in SPARQL (machine query language). With the LLM, I can speak in plain English, and it reformulates my request into the right graph operations.  

Another role: the LLM can ingest external text (papers, reports) and suggest new triples to insert, like the librarian reading new books and updating the catalog automatically.  

`Net effect:`
KG = structured factual memory (precise, but rigid). LLM = flexible reasoning & translation layer (imprecise, but good at language). Together = I get both rigor and flexibility.


*Quick smoke test*
-Chat GPT: doesnt support api access for plus user.

In [15]:
# from openai import OpenAI
# import os, json

# assert os.getenv("OPENAI_API_KEY"), "API key missing"
# client = OpenAI()

# resp = client.chat.completions.create(
#     model="gpt-4o-mini",
#     response_format={"type": "json_object"},   # ← JSON mode
#     messages=[
#         {"role":"system","content":"Return valid JSON only."},
#         {"role":"user","content":"Respond with {\"ok\": true}"}
#     ],
#     temperature=0
# )
# print(json.loads(resp.choices[0].message.content))

*Quick smoke test*  
-OLLAMA offers llms to be run on local machines and an API.

In [16]:
# import json, ollama
# r = ollama.chat(
#     model='llama3.2:3b',
#     messages=[{"role":"system","content":"Return ONLY valid JSON."},
#               {"role":"user","content":'{"ok": true}'}],
#     format='json',
#     options={'temperature': 0}
# )
# print(json.loads(r['message']['content']))

**Wiring 'ollama'into the pipeline.**  
a. `ingestion` via OLLAMA  
1. Schema

In [11]:
# Pydantic schema for LLM ↔ KG handoff (natural language → structured facts)
from pydantic import BaseModel, Field
from typing import Optional

class RowOut(BaseModel):
    # Human-readable material label (e.g., formula); becomes rdfs:label on EX.Material
    material: str

    # Columns we actually model in this project
    formula: Optional[str] = None                 # -> ex:hasFormula
    material_id: Optional[str] = None             # -> ex:hasExternalId
    crystal_system: Optional[str] = None          # -> ex:hasCrystalSystem
    is_centrosymmetric: Optional[bool] = None     # -> ex:hasCentrosymmetric

    # Numeric property (kept), no synthesis/lattice here
    band_gap_eV: Optional[float] = Field(default=None, ge=0)  # -> ex:hasBandGap (eV)

2. Building 'hints' from data

In [12]:
# (LLM role: use these sets to validate/normalize outputs before making triples)

allowed_crystal_systems = sorted(set(str(x).strip() for x in df['crystal_system'].dropna()))
allowed_centrosym       = sorted(set(bool(x) for x in df['is_centrosymmetric'].dropna()))

## aliases for my old schema:
# allowed_structs = allowed_crystal_systems  
# allowed_methods = []    
                   
print("crystal_systems:", allowed_crystal_systems)
print("centrosymmetric values:", allowed_centrosym)

crystal_systems: ['cubic', 'hexagonal', 'monoclinic', 'orthorhombic', 'tetragonal', 'triclinic', 'trigonal']
centrosymmetric values: [False, True]


3. Normalization of data-row (that're gonna be parsed) with 'ollama'.  
`Pydantic:` Python's most popular data validation library that can turn type hints into runtime validation rules. Instead of writing dozens of if isinstance() checks and custom validation functions, you define your data structure once using familiar Python syntax

In [13]:
# STRICT normalizer: requires ALL fields; supports raw text or a URL.
# If any required field is missing => raises ValueError("Text not informative enough...").

import json, ollama, re
from pydantic import ValidationError
from typing import Optional
import requests
from bs4 import BeautifulSoup
import hashlib

OLLAMA_MODEL_CANDIDATES = [
    'llama3.2:1b', 'llama3.2:1b-instruct',
    'llama3.2:3b-instruct-q4_0', 'llama3.2:3b-q4_0', 'llama3.2:3b'
]
OLLAMA_OPTIONS = {'temperature': 0, 'num_ctx': 1024, 'num_batch': 16}

def _available_ollama_models():
    try:
        info = ollama.list()
        return {m['model'] for m in info.get('models', [])}
    except Exception:
        return set()

def _pick_model():
    have = _available_ollama_models()
    for m in OLLAMA_MODEL_CANDIDATES:
        if m in have:
            try:
                ollama.chat(model=m, messages=[{"role":"user","content":"ping"}],
                            options={'num_ctx':128,'temperature':0})
                return m
            except Exception:
                continue
    raise RuntimeError(
        "No suitable local Ollama model found. Pull one of:\n  "
        + "\n  ".join(OLLAMA_MODEL_CANDIDATES)
        + "\nExample:\n  ollama pull llama3.2:1b"
    )

_MODEL = _pick_model()
print("Using Ollama model:", _MODEL)

STRICT_SYSTEM = f"""
You are a materials KG assistant.
Return ONLY JSON with keys EXACTLY:
material, formula, material_id, crystal_system, is_centrosymmetric, band_gap_eV.
Rules:
- Required: (material OR formula) AND crystal_system AND is_centrosymmetric AND band_gap_eV
- material_id is OPTIONAL .
- crystal_system ∈ {allowed_crystal_systems}
- is_centrosymmetric → boolean (true/false; accept yes/no/centro/non-centro)
- band_gap_eV → single float (eV)
- If required fields are missing, return: {{"error":"Text not informative enough to add in KG"}}
"""

def _to_bool(x):
    if x is None: return None
    s = str(x).strip().lower()
    if s in {"true","yes","y","1","t"}: return True
    if s in {"false","no","n","0","f"}: return False
    if "non" in s and "centro" in s: return False
    if "centro" in s: return True
    return None

def _fetch_url_text(url: str, max_chars: int = 50000) -> str:
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    for tag in soup(["script","style","nav","footer","header","noscript"]): tag.decompose()
    text = re.sub(r"\s+", " ", soup.get_text(" ").strip())
    return text[:max_chars]

def _fabricate_material_id(raw_text: str, formula: str | None, source_url: str | None) -> str:
    if source_url:
        return "url::" + hashlib.sha1(source_url.encode("utf-8")).hexdigest()[:12]
    base = (formula or "") + "|" + raw_text[:2048]
    return "text::" + hashlib.sha1(base.encode("utf-8")).hexdigest()[:12]

def _extract_material_id_from_text(text: str) -> str|None:
    import re
    m = re.search(r'\bmaterial[s]?_id\s*:\s*([^\s;,\)]+)', text, re.I)
    return m.group(1).strip() if m else None

def normalize_row_with_ollama(input_obj: dict, model: str|None=None) -> RowOut:
    
    # accept plain strings too
    if isinstance(input_obj, str):
        input_obj = {"text": input_obj}

    model = model or _MODEL
    source_url = input_obj.get("url")
    raw_text = _fetch_url_text(source_url) if source_url else str(input_obj.get("text",""))
    payload  = {"text": raw_text, "source_url": source_url} if source_url else input_obj

    # try the chosen model once; if it 404s, pick another available; do not loop forever
    tried = set()
    while True:
        try:
            resp = ollama.chat(
                model=model,
                messages=[{"role":"system","content": STRICT_SYSTEM},
                          {"role":"user","content": f"Extract to schema from: {json.dumps(payload)[:49000]}"}],
                format='json',
                options=OLLAMA_OPTIONS
            )
            data = json.loads(resp['message']['content'])
        except Exception as e:
            tried.add(model)
            # choose another available model
            avail = [m for m in _available_ollama_models() if m not in tried]
            model = next((m for m in OLLAMA_MODEL_CANDIDATES if m in avail), None)
            if not model:
                raise RuntimeError("Ollama call failed and no viable model remains.") from e
            continue


        if isinstance(data, dict) and data.get("error"):
            raise ValueError("Text not informative enough to add in KG")

        data["is_centrosymmetric"] = _to_bool(data.get("is_centrosymmetric"))
        # validate requireds
        material_ok = bool(str(data.get("material","")).strip())
        formula_ok  = bool(str(data.get("formula","")).strip())
        cs_ok       = bool(str(data.get("crystal_system","")).strip())
        centro_ok   = data.get("is_centrosymmetric") in (True, False)
        try:
            data["band_gap_eV"] = float(data["band_gap_eV"])
            bg_ok = True
        except Exception:
            bg_ok = False

        if not ((material_ok or formula_ok) and cs_ok and centro_ok and bg_ok):
            raise ValueError("Text not informative enough to add in KG")
        
        # If caller provided {"text": "..."} use it for id extraction / fabrication
        raw_text = raw_text or source_url.get("text") if isinstance(raw_text or source_url, dict) else None
        source_url = raw_text or source_url.get("url") if isinstance(raw_text or source_url, dict) else None
        
        hinted_id = _extract_material_id_from_text(raw_text or source_url or "")
        if hinted_id:
            data["material_id"] = hinted_id

        # fabricate if still missing
        if not str(data.get("material_id", "")).strip():
            data["material_id"] = _fabricate_material_id(raw_text or source_url or "", data.get("formula"), source_url)

        return RowOut(**data)

Using Ollama model: llama3.2:3b


4. **Appending normalized result into my graph.**  
*safeguard for dedupe, provenance, and idempotency are wired.*    
- add two provenance props (ex:statedIn, ex:hasProvenanceId) to the ontology.  
- build in-memory indices of existing materials by material_id and by formula (for dedupe).  
- add_once(s,p,o): guard so the same triple isn’t added twice.  
- get_or_create_material(...): reuse an existing material by material_id (first) or formula (fallback); otherwise mint a new one and register it in the indices.   
- small cache for provenance nodes so the same (label,id) source isn’t duplicated.  
- ingest_normalized_row(...): idempotently attach hasFormula, hasBandGap, hasCrystalSystem, hasCentrosymmetric; optionally link a provenance node via ex:statedIn; return the material IRI.

In [14]:
# LLM → KG: ingest a normalized row (idempotent, dedupe, provenance)

from rdflib import Literal, XSD, BNode, RDF, RDFS
from typing import Optional  # For python 3.10 users: replace str|None with Optional[str]

# --- 0) Ontology add-ons (once) ---
statedIn        = EX.statedIn
hasProvenanceId = EX.hasProvenanceId
for prop in [statedIn, hasProvenanceId]:
    g.add((prop, RDF.type, RDF.Property))
    g.add((prop, RDFS.domain, RDFS.Resource))
    g.add((prop, RDFS.range,  RDFS.Resource))

# --- 1) Fast lookup indices for dedupe by material_id / formula ---
def _index_materials():
    by_formula, by_id = {}, {}
    for m in g.subjects(RDF.type, Material):
        for f in g.objects(m, hasFormula):
            by_formula[str(f)] = m
        mid = g.value(m, hasExternalId)
        if mid:
            by_id[str(mid)] = m
    return by_formula, by_id

MAT_BY_FORMULA, MAT_BY_ID = _index_materials()

def _mint_material_iri(label: str|None, formula: str|None, idx: int):
    if formula and str(formula).strip():
        safe = _slugify(str(formula))
        iri  = EX[safe]
        g.add((iri, RDF.type, Material))
        g.add((iri, RDFS.label, Literal(str(formula))))
        return iri
    # fallback to generic minting with label or placeholder
    return mint_entity(label, Material, "Material", idx)


# --- 2) Idempotent triple adder ---
def add_once(s, p, o):
    if (s, p, o) not in g:
        g.add((s, p, o))

# --- 3) Get-or-create material with dedupe rules ---
def get_or_create_material(material_id: str|None,
                           formula: str|None,
                           label: str|None,
                           idx: int):
    f = (str(formula).strip() if formula else None)
    mid = (str(material_id).strip() if material_id else None)

    # 1) If formula present, ALWAYS key by formula (do not merge via ID)
    if f:
        if f in MAT_BY_FORMULA:
            return MAT_BY_FORMULA[f]
        m = _mint_material_iri(label, f, idx)
        add_once(m, hasFormula, Literal(f, datatype=XSD.string))
        MAT_BY_FORMULA[f] = m
        if mid:
            add_once(m, hasExternalId, Literal(mid, datatype=XSD.string))
            MAT_BY_ID[mid] = m
        return m

    # 2) Only if formula missing, use material_id as fallback key
    if mid and mid in MAT_BY_ID:
        return MAT_BY_ID[mid]
    m = _mint_material_iri(label, None, idx)
    if mid:
        add_once(m, hasExternalId, Literal(mid, datatype=XSD.string))
        MAT_BY_ID[mid] = m
    return m


# --- 4) Provenance node helper with small cache to avoid duplicates ---
_SOURCE_CACHE = {}  # key: (label, id) -> BNode

def make_source_node(source_label: Optional[str] = None, source_id: Optional[str] = None):
    key = (source_label or "", source_id or "")
    if key in _SOURCE_CACHE:
        return _SOURCE_CACHE[key]
    src = BNode()
    if source_label:
        add_once(src, RDFS.label, Literal(source_label))
    if source_id:
        add_once(src, hasProvenanceId, Literal(source_id))
    _SOURCE_CACHE[key] = src
    return src

# --- 5) Ingest (idempotent + dedupe + provenance) ---
def tag_ingest_metadata(m, idx: int):
    """Attach ingestIndex & ingestTime once (idempotent)."""
    add_once(m, ingestIndex, Literal(int(idx), datatype=XSD.integer))
    add_once(m, ingestTime, Literal(datetime.now(timezone.utc).isoformat(), datatype=XSD.dateTime))


def _has_triple(s, p):
    return any(True for _ in g.triples((s, p, None)))

def ingest_normalized_row(nr: RowOut, idx: int = 0,
                          source_label: str | None = None,
                          source_id: str | None = None):
    m = get_or_create_material(
        material_id = getattr(nr, "material_id", None),
        formula     = getattr(nr, "formula", None),
        label       = nr.material,
        idx         = idx
    )

    # identifiers (formula already attached at creation if present)
    if getattr(nr, "material_id", None):
        add_once(m, hasExternalId, Literal(str(nr.material_id), datatype=XSD.string))

    # numeric
    if nr.band_gap_eV is not None:
        add_once(m, hasBandGap, Literal(float(nr.band_gap_eV), datatype=XSD.float))

    # structure facets: add if missing
    if getattr(nr, "crystal_system", None) and not _has_triple(m, hasCrystalSystem):
        add_once(m, hasCrystalSystem, Literal(str(nr.crystal_system), datatype=XSD.string))
    if getattr(nr, "is_centrosymmetric", None) is not None and not _has_triple(m, hasCentrosymmetric):
        add_once(m, hasCentrosymmetric, Literal(bool(nr.is_centrosymmetric), datatype=XSD.boolean))

    # provenance + ingest meta
    if source_label or source_id:
        src = make_source_node(source_label, source_id)
        add_once(m, statedIn, src)
    add_once(m, ingestTime,  Literal(datetime.now(timezone.utc).isoformat(), datatype=XSD.dateTime))
    add_once(m, ingestIndex, Literal(int(idx), datatype=XSD.integer))

    return m

4.1 **Batch URL ingest (strict) with rejection previews**

In [None]:
from textwrap import shorten

def batch_ingest_urls(urls, start_idx: int = 1_000_000, text_preview_chars: int = 320):
    successes, rejects = [], []

    for k, url in enumerate(urls, start=0):
        try:
            nr = normalize_row_with_ollama({"url": url})   # STRICT: may raise ValueError
            m  = ingest_normalized_row(nr, idx=start_idx+k,
                                       source_label="web_import", source_id=url)
            # ingest_normalized_row now auto-tags metadata
            successes.append({
                "url": url,
                "material": nr.material,
                "formula": nr.formula,
                "material_id": nr.material_id,
                "crystal_system": nr.crystal_system,
                "is_centrosymmetric": nr.is_centrosymmetric,
                "band_gap_eV": nr.band_gap_eV,
            })
        except ValueError:
            # Not informative enough → show a short preview
            try:
                preview = shorten(_fetch_url_text(url, max_chars=4000), width=text_preview_chars, placeholder=" …")
            except Exception as fe:
                preview = f"[failed to fetch text: {fe}]"
            rejects.append({
                "url": url,
                "reason": "Text not informative enough to add in KG",
                "text_preview": preview,
            })
        except Exception as e:
            rejects.append({
                "url": url,
                "reason": f"Error: {e}",
                "text_preview": None,
            })

    df_ok  = pd.DataFrame(successes)
    df_bad = pd.DataFrame(rejects)
    print(f"Ingested: {len(df_ok)} | Rejected: {len(df_bad)} | Triples now: {len(g)}")
    if len(df_ok):
        display(df_ok.head(min(10, len(df_ok))))
    if len(df_bad):
        print("\nRejected entries (showing up to 10):")
        display(df_bad.head(min(10, len(df_bad))))
    return df_ok, df_bad

# Example call 
# ok, bad = batch_ingest_urls(["https://…/paper1", "https://…/paper2"])

b. `Query via OLLAMA`  
**NL → SPARQL → rdflib query → DataFrame**

In [None]:
# === NL→SPARQL + Sanitizer + Extended Programmatic Fallback (ranges, equals, centro=true) ===
import re, pandas as pd, ollama
from pyparsing import ParseException as PyParsingParseException
from datetime import datetime

SPARQL_PREFIX = """PREFIX ex: <http://example.org/mse#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
"""

NL2SPARQL_SYSTEM = f"""
Generate SPARQL SELECT for this ontology:

Class:
  ex:Material

Properties on ex:Material (all literals):
  ex:hasFormula (xsd:string)         -> ?formula
  ex:hasExternalId (xsd:string)
  ex:hasBandGap (xsd:float)          -> ?bandgap
  ex:hasCrystalSystem (xsd:string)   -> ?crystal_system
  ex:hasCentrosymmetric (xsd:boolean)-> ?centro

Rules (IMPORTANT):
- Output ONLY a SPARQL SELECT query.
- Include the PREFIX block exactly as given by the user.
- Always BIND variables before filtering them.
- Do NOT use 'NOT'. For non-centrosymmetric: ?m ex:hasCentrosymmetric ?centro . FILTER(?centro = false)
- If filtering band gap, ensure ?bandgap is bound (OPTIONAL if not central).
- Default projection (if unspecified): ?m ?label ?formula ?bandgap ?crystal_system ?centro ?source_label ?source_id
"""

def sanitize_sparql(q: str) -> str:
    import re

    q = q.strip()
    if q.startswith("```"):
        q = q.strip("`").split("\n", 1)[1].strip()

    PREFIX = SPARQL_PREFIX.strip() + "\n"
    if "PREFIX ex:" not in q:
        q = PREFIX + q

    # --- Collect WHERE-ish lines & tail (ORDER/LIMIT)
    lines = [ln.strip() for ln in q.splitlines() if ln.strip()]
    where_lines, tail_order = [], []
    for ln in lines:
        low = ln.lower()
        if low.startswith(("prefix", "select")):
            continue
        if low.startswith(("order by", "limit")):
            tail_order.append(ln)
            continue
        if low.startswith(("where {", "optional", "filter", "bind", "values", "}")) or "?m " in ln or " ex:" in ln or ln.startswith("?"):
            where_lines.append(ln)

    # --- Flatten WHERE body; remove stray braces; clean illegal patterns
    where_txt = "\n".join(where_lines)
    where_txt = where_txt.replace("WHERE {", "").replace("{", "").replace("}", "")
    # kill FILTER(BIND(...)) and self-BIND patterns
    where_txt = re.sub(r'FILTER\s*\(\s*BIND\s*\([^)]+\)[^)]*\)\s*\.?', '', where_txt, flags=re.IGNORECASE)
    where_txt = re.sub(r'BIND\s*\(\s*[a-zA-Z0-9_:]+\s*\(\s*\?([A-Za-z_]\w*)\s*\)\s*AS\s*\?\1\s*\)\s*\.?', '', where_txt)
    # fix centrosymmetry as string triple
    where_txt = re.sub(
        r'\?m\s+ex:hasCentrosymmetric\s+(?:"false"|\'false\'|false)\s*\.\s*',
        '?m ex:hasCentrosymmetric ?centro .\n'
        'FILTER( BOUND(?centro) && ((datatype(?centro)=xsd:boolean && ?centro=false) || lcase(str(?centro))="false") )\n',
        where_txt, flags=re.IGNORECASE
    )
    # remove junk like ") AND ?m ex:hasBandGap ?bandgap)"
    where_txt = re.sub(r'\)\s*AND\s*\?m\s+ex:hasBandGap\s+\?bandgap\)\s*', '', where_txt, flags=re.IGNORECASE)

    # --- Anchor & binding policy (Bandgap KG)
    if " a ex:material" not in where_txt.lower():
        where_txt = "?m a ex:Material .\n" + where_txt

    def ensure_required(txt: str, pattern: str, var: str) -> str:
        req = f"?m ex:{pattern} ?{var} ."
        txt = re.sub(rf"OPTIONAL\s*\{{\s*\?m\s+ex:{pattern}\s+\?{var}\s*\}}\s*", "", txt, flags=re.IGNORECASE)
        if req not in txt:
            txt = req + "\n" + txt
        return txt

    def ensure_optional(txt: str, pattern: str, var: str) -> str:
        req = f"?m ex:{pattern} ?{var} ."
        if req in txt:
            return txt
        if f"ex:{pattern}".lower() not in txt.lower():
            txt += f"\nOPTIONAL {{ ?m ex:{pattern} ?{var} }}"
        return txt

    wl = where_txt.lower()
    uses_bg_filter   = ("bandgap" in wl) and ("filter" in wl)
    uses_centro_flt  = ("centro" in wl) and ("filter" in wl)
    uses_crys_flt    = ("crystal_system" in wl) and ("filter" in wl)

    # ALWAYS required for Bandgap KG:
    where_txt = ensure_required(where_txt, "hasFormula", "formula")
    where_txt = ensure_required(where_txt, "hasBandGap", "bandgap")

    # Conditional required/optional:
    where_txt = ensure_required(where_txt, "hasCentrosymmetric", "centro") if uses_centro_flt else ensure_optional(where_txt, "hasCentrosymmetric", "centro")
    where_txt = ensure_required(where_txt, "hasCrystalSystem", "crystal_system") if uses_crys_flt else ensure_optional(where_txt, "hasCrystalSystem", "crystal_system")

    # Soft extras
    if "rdfs:label" not in wl:
        where_txt += "\nOPTIONAL { ?m rdfs:label ?label }"
    if "ex:statedin" not in wl:
        where_txt += "\nOPTIONAL { ?m ex:statedIn ?source }"
        where_txt += "\nOPTIONAL { ?source rdfs:label ?source_label }"
        where_txt += "\nOPTIONAL { ?source ex:hasProvenanceId ?source_id }"
    if "ex:ingesttime" not in wl:
        where_txt += "\nOPTIONAL { ?m ex:ingestTime ?ingest_time }"
    if "ex:ingestindex" not in wl:
        where_txt += "\nOPTIONAL { ?m ex:ingestIndex ?ingest_idx }"

    # Robust guards
    where_txt = where_txt.replace(
        "FILTER(?centro = false)",
        'FILTER( BOUND(?centro) && ((datatype(?centro)=xsd:boolean && ?centro=false) || lcase(str(?centro))="false") )'
    )
    where_txt = where_txt.replace(
        "FILTER(xsd:float(?bandgap)", "FILTER( BOUND(?bandgap) && xsd:float(?bandgap)"
    ).replace(
        "FILTER(BOUND(xsd:float(?bandgap))", "FILTER( BOUND(?bandgap)"
    )

    head = SPARQL_PREFIX + "SELECT ?m ?label ?formula ?bandgap ?crystal_system ?centro ?source_label ?source_id\n"
    q_clean = f"{head}WHERE {{\n  {where_txt.strip()}\n}}\n"
    if tail_order:
        q_clean += "\n".join(tail_order) + "\n"
    return q_clean

def nl_to_sparql(question: str, model=None):
    model = model or _MODEL
    prompt = f"""{SPARQL_PREFIX}
# Question:
{question}
# Write a valid SPARQL SELECT:"""
    resp = ollama.chat(
        model=model,
        messages=[{"role":"system","content": NL2SPARQL_SYSTEM},
                  {"role":"user","content": prompt}],
        options={'temperature': 0, 'num_ctx': 1024}
    )
    q = resp['message']['content'].strip()
    if q.startswith("```"):
        q = q.strip("`").split("\n",1)[1]
    return sanitize_sparql(q)

##########################------Execution wrappers ------##################################
def run_sparql(query: str):
    qres = g.query(query)
    cols = [str(v) for v in qres.vars]
    rows = [{str(k): (str(v) if v is not None else None) for k, v in zip(cols, r)} for r in qres]
    return pd.DataFrame(rows, columns=cols)


def ask_kg(question: str,
           n: int | None = None,
           last_n: bool = False,             # kept for API compatibility; ignored if window is set
           window: int | None = None,        # e.g., 100 → pre-filter to last 100 ingested
           pick: str = "last",               # "first" or "last" after windowing
           model=None):

    import re
    model = model or _MODEL

    # 1) NL → sanitized SPARQL (already SELECT ... WHERE { ... })
    sparql0 = nl_to_sparql(question, model=model)

    # 2) Extract WHERE body only (remove SELECT/PREFIX/ORDER/LIMIT)
    def _extract_where_body(q: str) -> str:
        # kill code fences
        q = q.strip()
        if q.startswith("```"):
            q = q.strip("`").split("\n", 1)[1].strip()

        # remove PREFIX lines
        lines = [ln for ln in q.splitlines() if not ln.strip().lower().startswith("prefix")]
        q = "\n".join(lines)

        # pull the chunk inside the outermost WHERE { ... }
        if "WHERE" not in q:
            return ""
        after_where = q.split("WHERE", 1)[1]
        lb = after_where.find("{")
        rb = after_where.rfind("}")
        body = after_where[lb+1:rb].strip() if (lb >= 0 and rb >= 0) else ""

        # drop any trailing ORDER/LIMIT the LLM might’ve put inside the body
        body = re.sub(r'(?is)\bORDER\s+BY\b.*$', '', body).strip()
        body = re.sub(r'(?is)\bLIMIT\s+\d+\s*$', '', body).strip()
        return body

    body = _extract_where_body(sparql0)

    # 3) Build a VALID query with an inner sub-SELECT for the recency window
    #    If window is None → no subselect; else wrap in subselect+ORDER+LIMIT
    subselect = ""
    join_ingest_bindings = ""
    if window is not None:
        subselect = (
            "  {\n"
            "    SELECT DISTINCT ?m ?ingest_time ?ingest_idx WHERE {\n"
            "      ?m a ex:Material .\n"
            "      OPTIONAL { ?m ex:ingestTime  ?ingest_time }\n"
            "      OPTIONAL { ?m ex:ingestIndex ?ingest_idx }\n"
            "    }\n"
            "    ORDER BY DESC(?ingest_time) DESC(?ingest_idx)\n"
            f"    LIMIT {int(window)}\n"
            "  }\n"
        )
        join_ingest_bindings = ""  # body already refers to ?m; ingest vars are available from subselect
    else:
        # no window: just ensure the ingest vars are optionally bound so ORDER BY works
        join_ingest_bindings = (
            "  OPTIONAL { ?m ex:ingestTime  ?ingest_time }\n"
            "  OPTIONAL { ?m ex:ingestIndex ?ingest_idx }\n"
        )

    # 4) Compose a clean SELECT head + WHERE { subselect + body (+optional binds) } + ORDER/LIMIT
    head = (
        SPARQL_PREFIX +
        "SELECT DISTINCT ?m ?label ?formula ?bandgap ?crystal_system ?centro ?source_label ?source_id ?ingest_time ?ingest_idx\n"
    )

    # Guarantee OPTIONAL projection bindings exist (cheap safety)
    def _ensure_optional_block(txt: str, triple: str) -> str:
        # if triple is not present, add OPTIONAL {...}
        if triple not in txt:
            return txt + f"  OPTIONAL {{ {triple} }}\n"
        return txt

    # Ensure OPTIONAL blocks for label/formula/bandgap/system/centro/source
    # (your sanitizer usually does this, but this keeps the wrapper robust)
    core_optionals = ""
    core_optionals = _ensure_optional_block(core_optionals, "?m rdfs:label ?label")
    core_optionals = _ensure_optional_block(core_optionals, "?m ex:hasFormula ?formula")
    core_optionals = _ensure_optional_block(core_optionals, "?m ex:hasBandGap ?bandgap")
    core_optionals = _ensure_optional_block(core_optionals, "?m ex:hasCrystalSystem ?crystal_system")
    core_optionals = _ensure_optional_block(core_optionals, "?m ex:hasCentrosymmetric ?centro")
    core_optionals += "  OPTIONAL { ?m ex:statedIn ?source }\n"
    core_optionals += "  OPTIONAL { ?source rdfs:label ?source_label }\n"
    core_optionals += "  OPTIONAL { ?source ex:hasProvenanceId ?source_id }\n"

    where_block = "WHERE {\n" + subselect
    # Always anchor to Material; the body may already add it, duplicate is harmless
    where_block += "  ?m a ex:Material .\n"
    if body:
        where_block += "  " + body + "\n"
    where_block += join_ingest_bindings
    where_block += core_optionals
    where_block += "}\n"

    order_block = "ORDER BY DESC(?ingest_time) DESC(?ingest_idx)\n"

    # Apply outer LIMIT for n (final slice); we’ll also dedupe by formula in Python after execution
    limit_block = f"LIMIT {int(n)}\n" if n is not None else ""

    sparql = head + where_block + order_block + limit_block

    print("SPARQL (sanitized):\n", sparql)

    # 5) Execute
    df = run_sparql(sparql)

    # 6) De-dupe by formula (keep first row, which is most recent due to ORDER BY)
    if "formula" in df.columns:
        df = df.dropna(subset=["formula"])
        # if user asked pick="first", we already ordered DESC; “first” means topmost
        # if pick="last", reverse within this window before de-dupe to keep oldest in the window
        if pick.lower() == "last":
            df = df.iloc[::-1]
        df = df.drop_duplicates(subset=["formula"], keep="first")
        # after dedupe, re-apply n if not applied in SPARQL
        if n is not None:
            df = df.head(n)

        # restore descending recency if we reversed
        if pick.lower() == "last":
            df = df.iloc[::-1].reset_index(drop=True)
        else:
            df = df.reset_index(drop=True)

    return df

***`Parser function`***   

**One text**  
res = parse_to_kg("ZnO wurtzite; band gap ≈ 3.3 eV; non-centrosymmetric.")
res

**From a URL {'url': ...}**
res = parse_to_kg({"url": "https://example.com/paper-abstract"}, source_label="paper", source_id="doi:10.1234/foo")
res

**Batch**  
demo_texts = [  
    "β-Ga2O3 has a band gap around 4.8 eV and is monoclinic; it's centrosymmetric. materials_id: funny:001",  
    "InP (zinc blende) band gap ~1.34 eV; non-centrosymmetric.",  
    "ZnO has band gap ≈ 3.3 eV; is non-centrosymmetric & wurtzite.",  
]  
df_summary = parse_many_to_kg(texts, start_idx=1_000_000, source_label="fabricated_demo")  
df_summary.head()


In [None]:
# ---- Parse natural language → triples in KG ----
# Inputs can be:
#   - plain text:        "ZnO wurtzite; Eg~3.3 eV; non-centrosymmetric"
#   - dict with URL:     {"url": "https://paper.example/abstract", "note": "foo"}
#   - dict with text:    {"text": "...", "source_id": "my:tag"}
#
# Returns a dict with success/metadata, and (for batch) a pandas DataFrame summary.

from typing import Any, Iterable, Optional
import pandas as pd
from datetime import datetime, timezone

def parse_to_kg(input_obj: Any,
                *,
                idx: Optional[int] = None,
                source_label: Optional[str] = None,
                source_id: Optional[str] = None,
                dry_run: bool = False) -> dict:
    """
    Parse one item (text or {text|url,...}) with Ollama and (optionally) ingest into the KG.

    Parameters
    ----------
    input_obj : str | dict
        The text to parse, or a dict with "text" or "url" (and optional extra metadata).
    idx : int | None
        Ingest index to stamp (if None, we won't add ex:ingestIndex).
    source_label : str | None
        Human-friendly provenance label (e.g., paper title, "arXiv abstract", etc.)
    source_id : str | None
        Stable provenance id (e.g., DOI, URL hash, internal tag).
    dry_run : bool
        If True, do not mutate the KG; only return what would be ingested.

    Returns
    -------
    dict
      {
        "ok": bool,
        "reason": str | None,
        "row": RowOut | None,
        "material_iri": rdflib.term.Identifier | None,
        "added_triples": int,
        "source_label": str | None,
        "source_id": str | None,
      }
    """
    # allow plain strings
    if isinstance(input_obj, str):
        input_obj = {"text": input_obj}

    # Normalize → RowOut (uses your STRICT_SYSTEM + fabrication of material_id if missing)
    try:
        nr = normalize_row_with_ollama(input_obj)
    except Exception as e:
        return {"ok": False, "reason": f"normalize failed: {e}", "row": None,
                "material_iri": None, "added_triples": 0,
                "source_label": source_label, "source_id": source_id}

    # If caller forgot provenance, try to glean from input_obj
    if isinstance(input_obj, dict):
        if source_label is None:
            source_label = input_obj.get("source_label") or input_obj.get("title") or "text"
        if source_id is None:
            source_id = input_obj.get("source_id") or input_obj.get("url") or _extract_material_id_from_text(str(input_obj.get("text","")))

    # Ingest (with diff-count)
    before = set(g)
    material_iri = None
    try:
        if dry_run:
            # no mutation: just report what WOULD be added
            return {"ok": True, "reason": "dry_run", "row": nr,
                    "material_iri": None, "added_triples": 0,
                    "source_label": source_label, "source_id": source_id}
        material_iri = ingest_normalized_row(
            nr,
            idx=(idx if idx is not None else int(datetime.now(timezone.utc).timestamp())),
            source_label=source_label,
            source_id=source_id
        )
    except Exception as e:
        return {"ok": False, "reason": f"ingest failed: {e}", "row": nr,
                "material_iri": None, "added_triples": 0,
                "source_label": source_label, "source_id": source_id}

    added = len(set(g) - before)
    return {"ok": True, "reason": None, "row": nr,
            "material_iri": material_iri, "added_triples": added,
            "source_label": source_label, "source_id": source_id}


def parse_many_to_kg(items: Iterable[Any],
                     *,
                     start_idx: int = 1_000_000,
                     source_label: Optional[str] = None,
                     dry_run: bool = False) -> pd.DataFrame:
    """
    Batch parse & ingest a list of items (texts or dicts).

    Each item gets idx = start_idx + i to keep ingest order stable.
    If source_label is given, it's used for all; otherwise per-item label is inferred.

    Returns a DataFrame with one row per item.
    """
    rows = []
    total_before = len(g)
    for i, it in enumerate(items, 1):
        res = parse_to_kg(
            it,
            idx=(start_idx + i),
            source_label=source_label,
            source_id=(None if isinstance(it, str) else it.get("source_id") if isinstance(it, dict) else None),
            dry_run=dry_run
        )
        rows.append({
            "i": i,
            "ok": res["ok"],
            "reason": res["reason"],
            "material": (res["row"].material if res["ok"] and res["row"] else None),
            "formula": (res["row"].formula if res["ok"] and res["row"] else None),
            "band_gap_eV": (res["row"].band_gap_eV if res["ok"] and res["row"] else None),
            "crystal_system": (res["row"].crystal_system if res["ok"] and res["row"] else None),
            "is_centrosymmetric": (res["row"].is_centrosymmetric if res["ok"] and res["row"] else None),
            "material_id": (res["row"].material_id if res["ok"] and res["row"] else None),
            "added_triples": res["added_triples"],
            "source_label": res["source_label"],
            "source_id": res["source_id"],
            "iri": str(res["material_iri"]) if res["material_iri"] else None,
        })
    total_after = len(g)
    df = pd.DataFrame(rows)
    if not dry_run:
        print(f"Batch added triples: {total_after - total_before} (across {len(rows)} items)")
    else:
        print(f"[dry_run] Would add triples for {len(rows)} items (graph unchanged)")
    return df

*Ingest a new CSV (dedupe by material_id→formula)*  
**Usage:**  
*ingest_new_csv("../data/new_semiconductors.csv", update_df=False)  # KG only*  
*ingest_new_csv("../data/new_semiconductors.csv", update_df=True)   # also append to df*

In [23]:
# import pandas as pd

# def _norm_str(x):
#     return str(x).strip() if pd.notna(x) and str(x).strip() not in {"", "nan", "None"} else None

# def ingest_new_csv(csv_path: str, update_df: bool = False):
#     new_df = pd.read_csv(csv_path)

#     # align columns if needed
#     rename_map = {
#         "band_gap": "band_gap_eV",
#         "crystalsystem": "crystal_system",
#         "centrosymmetric": "is_centrosymmetric",
#     }
#     new_df = new_df.rename(columns=rename_map)

#     added = 0
#     for i, r in new_df.iterrows():
#         nr = RowOut(
#             material          = _norm_str(r.get("formula")) or _norm_str(r.get("material_id")) or f"Material_new_{i}",
#             formula           = _norm_str(r.get("formula")),
#             material_id       = _norm_str(r.get("material_id")),
#             crystal_system    = _norm_str(r.get("crystal_system")),
#             is_centrosymmetric= (bool(r["is_centrosymmetric"]) if pd.notna(r.get("is_centrosymmetric")) else None),
#             band_gap_eV       = (float(r["band_gap_eV"]) if pd.notna(r.get("band_gap_eV")) else None),
#         )
#         # idempotent + dedup-aware
#         ingest_normalized_row(nr, idx=1_000_000 + i, source_label="csv_import", source_id=csv_path)
#         added += 1

#     if update_df:
#         global df
#         # naive append; optional real dedupe if you want:
#         # df = pd.concat([df, new_df], ignore_index=True).drop_duplicates(subset=["material_id","formula"], keep="first")
#         df = pd.concat([df, new_df], ignore_index=True)

#     print(f"Ingested {added} rows from {csv_path}. Triples now: {len(g)}")
#     return added

# KG with the whole dataframe.

In [None]:
# Bulk-ingest entire DataFrame into KG (silent)

import pandas as pd
from pathlib import Path
import gzip
from rdflib import URIRef, BNode, Literal, RDF, RDFS
from pyvis.network import Network

def _get_str(x):
    return str(x).strip() if x is not None and str(x).strip() not in {"", "nan", "None"} else None

# --- 1) Ingest whole DataFrame (no per-row prints) ---
ingested = 0
for i, r in df.iterrows():
    nr = RowOut(
        material = _get_str(r.get("formula")) or _get_str(r.get("material_id")) or f"Material_{i}",
        formula  = _get_str(r.get("formula")),
        material_id = _get_str(r.get("material_id")),
        crystal_system = _get_str(r.get("crystal_system")),
        is_centrosymmetric = (bool(r["is_centrosymmetric"]) if pd.notna(r.get("is_centrosymmetric")) else None),
        band_gap_eV = (float(r["band_gap_eV"]) if pd.notna(r.get("band_gap_eV")) else None),
    )
    ingest_normalized_row(nr, idx=i)
    ingested += 1

# --- 2) Save KG (TTL, TTL.GZ, N-Triples) ---
OUT_DIR = Path("../data")
OUT_DIR.mkdir(parents=True, exist_ok=True)
TTL_PATH   = OUT_DIR / "mse_kg_full.ttl"
TTL_GZPATH = OUT_DIR / "mse_kg_full.ttl.gz"
NT_PATH    = OUT_DIR / "mse_kg_full.nt"

ttl_bytes = g.serialize(format="turtle", encoding="utf-8")
TTL_PATH.write_bytes(ttl_bytes)
with gzip.open(TTL_GZPATH, "wb") as f:
    f.write(ttl_bytes)

nt_bytes = g.serialize(format="nt", encoding="utf-8")
NT_PATH.write_bytes(nt_bytes)

# --- 3) Save lightweight interactive HTML view (limited edges for performance) ---
def _label(term):
    lab = g.value(term, RDFS.label)
    if lab: return str(lab)
    if isinstance(term, URIRef):
        try: return g.namespace_manager.normalizeUri(term)
        except: pass
        s=str(term); return s.rsplit("#",1)[-1].rsplit("/",1)[-1]
    return str(term)

def _nid(term):
    if isinstance(term, (URIRef, BNode)): return str(term)
    return f"lit:{hash((str(term), type(term).__name__))}"

def _style(term):
    types = set(g.objects(term, RDF.type)) if isinstance(term, (URIRef, BNode)) else set()
    def has(local): return any(str(t).endswith(f"#{local}") or str(t).endswith(f"/{local}") for t in types)
    if has("Material"):         return dict(color="#2b8a3e", shape="ellipse")
    if has("CrystalStructure"): return dict(color="#1c7ed6", shape="ellipse")
    if isinstance(term, Literal): return dict(color="#bfbfbf", shape="box")
    return dict(color="#666666", shape="ellipse")

def save_html_subset(g, out_html:str, max_edges:int=5000, show_literals:bool=False, height:str="800px"):
    net = Network(height=height, width="100%", directed=True, notebook=True, cdn_resources="in_line")
    net.toggle_physics(True)
    added=set(); edges=0
    for s,p,o in g.triples((None,None,None)):
        if edges>=max_edges: break
        if not show_literals and isinstance(o, Literal):
            continue
        sid=_nid(s); oid=_nid(o)
        if sid not in added:
            net.add_node(sid, label=_label(s), **_style(s)); added.add(sid)
        if show_literals or not isinstance(o, Literal):
            if oid not in added:
                net.add_node(oid, label=_label(o), **_style(o)); added.add(oid)
        pred = _label(p).replace("ex:","").replace("hasBandGap","band_gap_eV")\
                        .replace("hasCrystalSystem","crystal_system")\
                        .replace("hasCentrosymmetric","centrosymmetric")\
                        .replace("hasFormula","formula").replace("hasExternalId","material_id")
        net.add_edge(sid, oid, label=pred); edges+=1
    net.show(out_html)

HTML_PATH = OUT_DIR / "kg_full.html"
save_html_subset(g, str(HTML_PATH), max_edges=5000, show_literals=False)

# --- 4) Final single-line summary (no verbosity) ---
print(f"Ingested {ingested} rows → Triples: {len(g)} | Saved: {TTL_PATH.name}, {TTL_GZPATH.name}, {NT_PATH.name}, {HTML_PATH.name}")

..\data\kg_full.html
Ingested 150987 rows → Triples: 1097053 | Saved: mse_kg_full.ttl, mse_kg_full.ttl.gz, mse_kg_full.nt, kg_full.html


In [19]:
df.shape

(150987, 6)

# Some queries

In [18]:
run_sparql("""
PREFIX ex: <http://example.org/mse#>
SELECT (COUNT(*) AS ?n) WHERE { ?m ex:hasBandGap ?bandgap }
""")


Unnamed: 0,n
0,132877


In [20]:
# How many Material nodes?
run_sparql("""
PREFIX ex:<http://example.org/mse#>
SELECT (COUNT(DISTINCT ?m) AS ?materials)
WHERE { ?m a ex:Material }
""")

# How many materials have a band gap?
run_sparql("""
PREFIX ex:<http://example.org/mse#>
SELECT (COUNT(DISTINCT ?m) AS ?materials_with_Eg)
WHERE { ?m ex:hasBandGap ?bandgap }
""")

# (In pandas) how many rows had a non-null band_gap_eV?
df['band_gap_eV'].notna().sum()

# How many unique materials remained after getting filtered by dedupe key(s)?
df['material_id'].nunique(), df['formula'].nunique()


(150987, 102415)

*Some important queries for reference*

In [None]:
###1) One row per material by most-recent ingest (per formula)--when the latest fact set for each formula, within a recency window.

run_sparql(
"""PREFIX ex:   <http://example.org/mse#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd:  <http://www.w3.org/2001/XMLSchema#>

# --- window: only consider the latest N materials by ingest meta
# change LIMIT 100 to your window size
SELECT ?m ?formula ?bandgap ?crystal_system ?centro ?label ?source_label ?source_id ?ingest_time ?ingest_idx
WHERE {
  { SELECT DISTINCT ?m ?formula ?ingest_time ?ingest_idx
    WHERE {
      { SELECT DISTINCT ?m ?ingest_time ?ingest_idx WHERE {
          ?m a ex:Material .
          OPTIONAL { ?m ex:ingestTime  ?ingest_time }
          OPTIONAL { ?m ex:ingestIndex ?ingest_idx }
        }
        ORDER BY DESC(?ingest_time) DESC(?ingest_idx)
        LIMIT 100
      }
      ?m ex:hasFormula ?formula .
      OPTIONAL { ?m ex:ingestTime  ?ingest_time }
      OPTIONAL { ?m ex:ingestIndex ?ingest_idx }

      # keep only the most recent row for this formula
      FILTER NOT EXISTS {
        ?m2 a ex:Material ;
            ex:hasFormula ?formula .
        OPTIONAL { ?m2 ex:ingestTime  ?t2 }
        OPTIONAL { ?m2 ex:ingestIndex ?i2 }
        FILTER(
          COALESCE(?t2, xsd:dateTime("0001-01-01T00:00:00Z")) >  COALESCE(?ingest_time, xsd:dateTime("0001-01-01T00:00:00Z"))
          ||
          ( COALESCE(?t2, xsd:dateTime("0001-01-01T00:00:00Z")) = COALESCE(?ingest_time, xsd:dateTime("0001-01-01T00:00:00Z"))
            && COALESCE(?i2, -1) > COALESCE(?ingest_idx, -1)
          )
        )
      }
    }
  }

  # now attach properties for that chosen material node
  ?m ex:hasBandGap ?bandgap .
  OPTIONAL { ?m ex:hasCrystalSystem ?crystal_system }
  OPTIONAL { ?m ex:hasCentrosymmetric ?centro }
  OPTIONAL { ?m rdfs:label ?label }
  OPTIONAL { ?m ex:statedIn ?source .
            OPTIONAL { ?source rdfs:label       ?source_label }
            OPTIONAL { ?source ex:hasProvenanceId ?source_id } }

  # example filters – tweak as you like
  FILTER( xsd:float(?bandgap) > 3.0 )
  FILTER( !BOUND(?centro) || (datatype(?centro)=xsd:boolean && ?centro=false) || lcase(str(?centro))="false" )
}
ORDER BY DESC(?ingest_time) DESC(?ingest_idx)
LIMIT 10
"""


)

### when  “the highest reported Eg per formula” (and still show one row). is wanted
run_sparql(
""" 
PREFIX ex:   <http://example.org/mse#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd:  <http://www.w3.org/2001/XMLSchema#>

# --- window: latest N materials first (optional but keeps things fast)
# change LIMIT 100 to your window size, or remove the subselect to scan full KG
WITH {
  SELECT DISTINCT ?m ?ingest_time ?ingest_idx WHERE {
    ?m a ex:Material .
    OPTIONAL { ?m ex:ingestTime  ?ingest_time }
    OPTIONAL { ?m ex:ingestIndex ?ingest_idx }
  }
  ORDER BY DESC(?ingest_time) DESC(?ingest_idx)
  LIMIT 100
} AS %win

# compute MAX Eg per formula across the window
SELECT ?formula (MAX(xsd:float(?bandgap)) AS ?bandgap)
       (SAMPLE(?crystal_system) AS ?crystal_system)
       (SAMPLE(?centro) AS ?centro)
       (SAMPLE(?m) AS ?any_m)
WHERE {
  INCLUDE %win
  ?m ex:hasFormula ?formula .
  ?m ex:hasBandGap ?bandgap .
  OPTIONAL { ?m ex:hasCrystalSystem ?crystal_system }
  OPTIONAL { ?m ex:hasCentrosymmetric ?centro }

  # require that some material with this formula is non-centrosymmetric
  FILTER EXISTS {
    ?m2 ex:hasFormula ?formula ;
        ex:hasCentrosymmetric ?c2 .
    FILTER( (datatype(?c2)=xsd:boolean && ?c2=false) || lcase(str(?c2))="false" )
  }
}
GROUP BY ?formula
HAVING (MAX(xsd:float(?bandgap)) > 3.0)
ORDER BY DESC(?bandgap)
LIMIT 10
""")

**Sanity-check fot LLM Wiring**  
Demo: add a few new entries from text, then query  

In [19]:
texts = [
    "Ga2O3N5Cl7 has a band gap around 4.8 eV and is monoclinic; it's centrosymmetric. materials_id: funny:001,",
    "Ga8O3N5Cl7 band gap ~1.34 eV; non-centrosymmetric & hexagonal; materials_id: funny:002,",
    "Ga5O33N5Cl7 has band gap ≈ 3.3 eV; is non-centrosymmetric & cubic; materials_id: funny:003,",
] 
df_summary = parse_many_to_kg(texts, start_idx=1_000_000, source_label="fabricated_demo")  
df_summary = df_summary.iloc[:, 4:-1]
df_summary.head()

Batch added triples: 36 (across 3 items)


Unnamed: 0,formula,band_gap_eV,crystal_system,is_centrosymmetric,material_id,added_triples,source_label,source_id
0,Ga2O3N5Cl7,4.8,monoclinic,True,funny:001,12,fabricated_demo,funny:001
1,Ga8O3N5Cl7,1.34,hexagonal,False,funny:002,12,fabricated_demo,funny:002
2,Ga5O33N5Cl7,3.3,cubic,False,funny:003,12,fabricated_demo,funny:003


**Material diagnosis via Formula**

In [20]:
def show_material(f):
    m = next(g.subjects(hasFormula, Literal(f, datatype=XSD.string)), None)
    if not m:
        print("No node with formula:", f); return
    print("Node:", m)
    for (p, name) in [(hasFormula,"formula"),
                      (hasBandGap,"bandgap"),
                      (hasCrystalSystem,"crystal_system"),
                      (hasCentrosymmetric,"centro")]:
        vals = [str(o) for o in g.objects(m, p)]
        print(f"  {name}: {vals if vals else '—'}")

show_material("Ga5O33N5Cl7")
show_material("Ga8O3N5Cl7")
show_material("Ga2O3N5Cl73")  

Node: http://example.org/mse#Ga5O33N5Cl7
  formula: ['Ga5O33N5Cl7']
  bandgap: ['3.3']
  crystal_system: ['cubic']
  centro: ['false']
Node: http://example.org/mse#Ga8O3N5Cl7
  formula: ['Ga8O3N5Cl7']
  bandgap: ['1.34']
  crystal_system: ['hexagonal']
  centro: ['false']
No node with formula: Ga2O3N5Cl73


In [None]:
ask_kg(
  "List non-centrosymmetric semiconductors with band gap > 3 eV, show formula and crystal system.",
  window=100, pick="last", n=5
)

SPARQL (sanitized):
 PREFIX ex: <http://example.org/mse#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT DISTINCT ?m ?label ?formula ?bandgap ?crystal_system ?centro ?source_label ?source_id ?ingest_time ?ingest_idx
WHERE {
  {
    SELECT DISTINCT ?m ?ingest_time ?ingest_idx WHERE {
      ?m a ex:Material .
      OPTIONAL { ?m ex:ingestTime  ?ingest_time }
      OPTIONAL { ?m ex:ingestIndex ?ingest_idx }
    }
    ORDER BY DESC(?ingest_time) DESC(?ingest_idx)
    LIMIT 100
  }
  ?m a ex:Material .
  ?m ex:hasBandGap ?bandgap .
?m a ex:Material .

?m ex:hasCentrosymmetric ?centro .
FILTER( BOUND(?centro) && ((datatype(?centro)=xsd:boolean && ?centro=false) || lcase(str(?centro))="false") )
?m ex:hasFormula ?formula .
?m ex:hasCrystalSystem ?crystal_system .

OPTIONAL { ?m rdfs:label ?label }
OPTIONAL { ?m ex:statedIn ?source }
OPTIONAL { ?source rdfs:label ?source_label }
OPTIONAL { ?source ex:hasProvenanceId ?source_id }
OPTI