# Mini Materials Knowledge Graph — Common Semiconductors
**What I'm building:** a tiny pipeline that turns a small semiconductors table into **RDF triples**, then I query it using **SPARQL**. I keep the ontology minimal and readable.


## Some basic terminologies
- **Ontology** → my small schema + vocabulary for this domain (classes + relations).
- **RDF (Resource Description Framework) triple** → one fact written as `subject — predicate — object`.
- **Namespace** → URL prefix so my identifiers are unique.
- **SPARQL** → my query tool for RDF graphs (like SQL but for triples).
- **Turtle (.ttl)** → compact text format to store RDF.
- **IRI(Internationalized Resource Identifier)** → a **Unicode string** that serves as a unique, global name or identifier for an entity.
- **Regex (Regular Expression)** → a specialized mini-language for describing text patterns using a combination of *literal* characters and *metacharacters*; it lets me find, match, replace, and validate strings by specifying rules for character combinations. Literal characters match themselves exactly (`cat` matches the literal string "cat"). Metacharacters are special characters with a specific meaning inside the regex pattern, such as:
(dot '.'): Matches any single character. 
(asterisk '*'): Matches the preceding character zero or more times. 
(plus '+'): Matches the preceding character one or more times. 
(question mark '?'): Matches the preceding character zero or one time. 
(pipe '|'): Acts as an "OR" operator, allowing you to match one pattern or another. 
\d: Matches a digit (0-9). 

**Imports**

In [None]:
# import os; print("OK" if os.getenv("OPENAI_API_KEY") else "MISSING")

OK


In [1]:
import pandas as pd
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, XSD
from pathlib import Path
import re
from dotenv import load_dotenv
load_dotenv()
import matplotlib.pyplot as plt
import json, ast
from matminer.featurizers.structure import GlobalSymmetryFeatures
from pymatgen.core import Structure
import openpyxl
from graphviz import Digraph

# 0) Dataset prep. for KG   
The dataset I am using is parsed from the Materials Project database, where `structure` is pymatgen's pretty-printed "Structure summary", not a standard format (CIF/POSCAR/JSON). Therefore regex helpers are used to decode that text, and a `Structure` object is rebuilt from it. The `Structure` object is needed for matminer's structure featurizer, from which I extract the `crystal_system` and whether the structure has an inversion center (`is_centrosymmetric`). I find this to be valuable information for the knowledge graph I am going to build.

In [16]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
from pymatgen.core import Structure, Lattice
from matminer.featurizers.structure import GlobalSymmetryFeatures

# -------- Locations: the notebook runs from .../notebooks, the data sits in .../data ----------
DATA_DIR = Path("../data").resolve()
INPUT_XLSX, OUTPUT_CSV = (
    DATA_DIR / "full_dataset_Bandgap_0_to_5.xlsx",
    DATA_DIR / "full_dataset_Bandgap_0_to_5_featurized.csv",
)

# -------- Patterns that pick fields out of the pretty-printed summary ----------
# One captured numeric token (sign, digits, dot, exponent marker), and three of
# them separated by whitespace; shared by the lattice-length, angle and site-row
# patterns so the three stay in step.
_NUMBER = r"([-\d\.Ee+]+)"
_THREE_NUMBERS = r"\s+".join([_NUMBER] * 3)
_FLAG = r"(True|False)"

re_abc = re.compile(r"abc\s*:\s*" + _THREE_NUMBERS, re.I)
re_angles = re.compile(r"angles\s*:\s*" + _THREE_NUMBERS, re.I)
re_pbc = re.compile(r"pbc\s*:\s*" + r"\s+".join([_FLAG] * 3), re.I)
re_sites_header = re.compile(r"Sites\s*\(\d+\)", re.I)
# Site row: leading index, element symbol (one or two letters), three coordinates.
re_site_row = re.compile(r"^\s*\d+\s+([A-Za-z][a-z]?)\s+" + _THREE_NUMBERS, re.I)

def parse_pretty_structure(txt):
    """Rebuild a pymatgen ``Structure`` from its pretty-printed text summary.

    The text must contain an ``abc : a b c`` line, an ``angles : alpha beta gamma``
    line and a ``Sites (N)`` header followed by site rows (index, element symbol,
    three coordinates that are passed on as fractional coordinates).

    Parameters
    ----------
    txt : object
        Cell content. Anything that is not a ``str`` is rejected.

    Returns
    -------
    Structure or None
        ``None`` when the input is not a string, when the lattice lines or the
        sites header are missing, when a captured number cannot be converted to
        ``float``, when no site row is found, or when pymatgen refuses to build
        the lattice/structure. The function never raises for malformed text, so
        it is safe to use with ``Series.apply`` on a whole column.
    """
    if not isinstance(txt, str):
        return None
    s = txt.strip()

    # Lattice params
    m_abc = re_abc.search(s)
    m_ang = re_angles.search(s)
    if not (m_abc and m_ang):
        return None
    try:
        # The numeric character class is permissive (it also matches e.g. "-" or
        # "1.2.3"), so the conversion itself has to be guarded.
        a, b, c = map(float, m_abc.groups())
        alpha, beta, gamma = map(float, m_ang.groups())
    except ValueError:
        return None

    # Sites block: find header, then parse subsequent lines
    lines = s.splitlines()
    start_idx = next((i for i, ln in enumerate(lines) if re_sites_header.search(ln)), None)
    if start_idx is None:
        return None

    species, frac_coords = [], []
    for ln in lines[start_idx + 1:]:
        m = re_site_row.match(ln)
        if not m:
            # column headers, separator rules and blank lines are skipped
            continue
        sp, *raw_coords = m.groups()
        try:
            coords = [float(v) for v in raw_coords]
        except ValueError:
            # Dropping only this site would silently yield a different crystal;
            # reject the whole cell instead.
            return None
        species.append(sp)
        frac_coords.append(coords)

    if not species:
        return None

    # Build lattice & structure. Periodicity is assumed along all three axes;
    # the "pbc" line of the summary is not used here.
    try:
        lat = Lattice.from_parameters(a=a, b=b, c=c, alpha=alpha, beta=beta, gamma=gamma)
        return Structure(lattice=lat, species=species, coords=frac_coords, coords_are_cartesian=False)
    except Exception:
        # Best-effort parser: any construction failure means "could not parse".
        return None

# -------- Read the workbook ----------
df = pd.read_excel(INPUT_XLSX)

# Locate the column that holds the structure text: a column literally named
# "structure" wins; otherwise take the first column in which any cell looks like
# the pretty summary.
if "structure" in df.columns:
    struct_col = "structure"
else:
    candidates = [
        col for col in df.columns
        if df[col].astype(str).str.contains(r"Full Formula|Reduced Formula|Sites \(", regex=True, na=False).any()
    ]
    struct_col = next(iter(candidates), None)
    if struct_col is None:
        raise ValueError(f"Could not find the structure column. Available columns: {list(df.columns)}")

# -------- Text -> Structure --------
parsed = df[struct_col].apply(parse_pretty_structure)
parsed_ok = parsed.apply(lambda obj: isinstance(obj, Structure))
print(f"Parsed {parsed_ok.sum()} / {len(parsed)} structures ({100*parsed_ok.mean():.1f}%).")

if not parsed_ok.any():
    raise RuntimeError("Parser could not reconstruct any structures from the pretty string format. "
                       "Please share one exact cell (as plain text) or consider storing CIF/JSON for structures.")

# The text column now carries Structure objects (or None where parsing failed)
df[struct_col] = parsed

# -------- Symmetry features (only two of them are kept) --------
gsf = GlobalSymmetryFeatures()
labels = gsf.feature_labels()  # ['spacegroup_num','crystal_system','crystal_system_int','is_centrosymmetric','n_symmetry_ops']


def _symmetry_record(obj):
    """Feature dict for one cell; all-NaN when the cell is not a Structure or featurizing fails."""
    if not isinstance(obj, Structure):
        return dict.fromkeys(labels, np.nan)
    try:
        return dict(zip(labels, gsf.featurize(obj)))
    except Exception:
        return dict.fromkeys(labels, np.nan)


records = [_symmetry_record(cell) for cell in df[struct_col]]
feat_df = pd.DataFrame(records)

kept_features = feat_df[["crystal_system", "is_centrosymmetric"]].reset_index(drop=True)
out_df = pd.concat([df.reset_index(drop=True), kept_features], axis=1)

# -------- Write the featurized table and report the missing-value share --------
out_df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved: {OUTPUT_CSV}")
print(out_df[["crystal_system", "is_centrosymmetric"]].isna().mean())
out_df.head()

Parsed 150987 / 150987 structures (100.0%).
Saved: E:\Projects\Semantic_models_for-MSE\data\full_dataset_Bandgap_0_to_5_featurized.csv
crystal_system        0.0
is_centrosymmetric    0.0
dtype: float64


Unnamed: 0,material_id,formula,band_gap,structure,crystal_system,is_centrosymmetric
0,mp-10018,Ac,0.0,[[0. 0. 0.] Ac],cubic,True
1,mp-1183057,Ac,0.0,[[1.31096178e+00 2.27065255e+00 3.21093059e-16...,trigonal,False
2,mp-1183069,Ac,0.0,"[[0. 0. 0.] Ac, [2.65892229 0.77478234 2.89079...",trigonal,True
3,mp-862690,Ac,0.0,"[[0. 0. 0.] Ac, [-3.26165376e-07 2.33598734e+...",hexagonal,True
4,mp-861724,Ac2AgIr,0.0,"[[3.10657746 2.19668199 5.38075 ] Ac, [0. 0....",cubic,True


## 1) Setup
read a tiny CSV and map it into RDF using `rdflib`.


In [2]:
# DATA = Path("../data/semiconductors_small.csv")   # CSV lives in the repo
# TTL_OUT = Path("../data/semiconductors_small.ttl")# RDF Turtle output
# print("Using data:", DATA.resolve())

# Input and output of the KG build; both live under ../data relative to the notebook.
_KG_DATA_DIR = Path("../data")
DATA = _KG_DATA_DIR / "full_dataset_Bandgap_0_to_5_featurized.csv"  # featurized CSV to read
TTL_OUT = _KG_DATA_DIR / "full_dataset_Bandgap_0_to_5.ttl"          # Turtle file to write

for caption, location in (("Using data:", DATA), ("Will write TTL:", TTL_OUT)):
    print(caption, location.resolve())

Using data: E:\Projects\Semantic_models_for-MSE\data\full_dataset_Bandgap_0_to_5_featurized.csv
Will write TTL: E:\Projects\Semantic_models_for-MSE\data\full_dataset_Bandgap_0_to_5.ttl



## 2) Ontology skeleton (small and pragmatic)
a few classes and properties that I actually need for the initial CSV i am using.  
Classes: `Material`, `SynthesisMethod`, `CrystalStructure`, `Property`  
Properties:  
- data: `hasBandGap` (float eV), `hasLatticeConstant` (float Å)  
- object: `hasCrystalStructure`, `synthesizedBy`

- the final dataset will have: structure (crystal system and inversion center), composition and bandgap.  

3 class declarations (Material, CrystalStructure, Property) → 3  
5 property declarations (hasExternalId, hasFormula, hasBandGap, hasCrystalSystem, hasCentrosymmetric) → 5  
5 × (domain + range) annotations, i.e. 2 per property → 10  
Total = 3 + 5 + 10 = 18 triples.

In [3]:
# Ontology skeleton for: material_id, formula, band_gap, crystal_system, is_centrosymmetric
# (structure captured via crystal system + inversion center only)

from rdflib import Graph, Namespace, RDF, RDFS, XSD

g = Graph()

# Namespaces: one project namespace plus the two standard vocabularies used below
EX = Namespace("http://example.org/mse#")
for prefix, namespace in (("ex", EX), ("rdfs", RDFS), ("xsd", XSD)):
    g.bind(prefix, namespace)

# Classes (Property is only a generic placeholder)
Material, CrystalStructure, Property = EX.Material, EX.CrystalStructure, EX.Property
for klass in (Material, CrystalStructure, Property):
    g.add((klass, RDF.type, RDFS.Class))

# Datatype properties, one per column of the featurized CSV
hasExternalId = EX.hasExternalId            # <- material_id
hasFormula = EX.hasFormula                  # <- formula (composition string)
hasBandGap = EX.hasBandGap                  # <- band_gap (eV)
hasCrystalSystem = EX.hasCrystalSystem      # <- crystal_system (e.g., cubic, hexagonal)
hasCentrosymmetric = EX.hasCentrosymmetric  # <- is_centrosymmetric (True/False)

# Every property has Material as its domain; only the literal range differs.
_PROPERTY_RANGES = (
    (hasExternalId, XSD.string),
    (hasFormula, XSD.string),
    (hasBandGap, XSD.float),
    (hasCrystalSystem, XSD.string),
    (hasCentrosymmetric, XSD.boolean),
)
for predicate, _ in _PROPERTY_RANGES:
    g.add((predicate, RDF.type, RDF.Property))
for predicate, value_type in _PROPERTY_RANGES:
    g.add((predicate, RDFS.domain, Material))
    g.add((predicate, RDFS.range, value_type))

print("Ontology initialized (structure via crystal_system + inversion center, plus composition & bandgap).")
print("Triples so far:", len(g))

Ontology initialized (structure via crystal_system + inversion center, plus composition & bandgap).
Triples so far: 18


## 3) Load CSV and mint entities  
create IRIs (Internationalized Resource Identifier) from labels (simple normalization) and assert triples for each row.

In [4]:
# --- 1) Read CSV and canonicalize headers for THIS project (no synthesis, no lattice const) ---
# Featurized CSV columns: material_id, formula, band_gap, crystal_system, is_centrosymmetric
import pandas as pd, re
from rdflib import Literal, RDF, RDFS, XSD, URIRef

# load
df_raw = pd.read_csv(DATA)

# Rename to the stable names used everywhere below. Only band_gap changes
# (the unit goes into the name); the other columns already have their final names.
df = df_raw.rename(columns={"band_gap": "band_gap_eV"}).copy()

# create a human label for the Material (use formula as the label)
df["material_label"] = df["formula"]

# enforce dtypes
for col in ["material_label", "formula", "material_id", "crystal_system"]:
    if col in df.columns:
        df[col] = df[col].astype("string")
# unparsable band gaps become NaN instead of raising
df["band_gap_eV"] = pd.to_numeric(df.get("band_gap_eV"), errors="coerce")


def _parse_bool_cell(value):
    """Normalize one is_centrosymmetric cell to True / False / None.

    Missing values give None. "true" (any case) and 1 give True, everything else
    gives False. The float spelling "1.0" is accepted as well: pandas reads a 0/1
    column containing missing values as floats, and those would otherwise all be
    mapped to False.
    """
    if pd.isna(value):
        return None
    return str(value).strip().lower() in {"true", "1", "1.0"}


if "is_centrosymmetric" in df.columns:
    # normalize truthy strings/numbers (e.g., "True"/"FALSE"/"1"/"0"/1.0/0.0)
    df["is_centrosymmetric"] = df["is_centrosymmetric"].map(_parse_bool_cell)

# --- 2) Helpers (consistent across the notebook) ---
def _slugify(text: str) -> str:
    """Turn a label into an IRI-safe local name.

    Surrounding whitespace is trimmed, parentheses are dropped, and every
    remaining character outside ``A-Z a-z 0-9 _`` becomes an underscore.
    """
    trimmed = str(text).strip()
    without_parens = re.sub(r"[()]", "", trimmed)
    return re.sub(r"[^A-Za-z0-9_]", "_", without_parens)

def mint_entity(label, cls: URIRef, fallback_prefix: str, idx: int):
    """Assert an entity of class ``cls`` in the notebook-level graph and return its IRI.

    With a usable label the IRI is ``EX[_slugify(label)]`` and the label text is
    stored as rdfs:label. With a missing or blank label the IRI is
    ``EX["<fallback_prefix>_<idx>"]`` and the rdfs:label is "<fallback_prefix> #<idx>".
    """
    label_missing = label is None or pd.isna(label) or str(label).strip() == ""
    if label_missing:
        local_name = f"{fallback_prefix}_{idx}"
        shown = f"{fallback_prefix} #{idx}"
    else:
        shown = str(label)
        local_name = _slugify(shown)

    iri = EX[local_name]
    g.add((iri, RDF.type, cls))
    g.add((iri, RDFS.label, Literal(shown)))
    return iri

# Quick report: how many rows came in and which column names are now in use.
row_count, column_names = len(df), list(df.columns)
print("Data loaded. Rows:", row_count)
print("Columns:", column_names)

Data loaded. Rows: 150987
Columns: ['material_id', 'formula', 'band_gap_eV', 'structure', 'crystal_system', 'is_centrosymmetric', 'material_label']


## 4) Serialize to Turtle
write the RDF graph to a `.ttl` file so it’s versionable in Git and easy to inspect.


In [5]:
# --- Write the graph out as Turtle ---
# An explicit encoding makes serialize() hand back bytes, which write_bytes() expects.
ttl_bytes = g.serialize(format="turtle", encoding="utf-8")
TTL_OUT.write_bytes(ttl_bytes)

# quick sanity check: where the file went, graph size, size on disk
p = TTL_OUT.resolve()
for caption, value in (
    ("Wrote:", p),
    ("Triples in graph:", len(g)),
    ("File size (bytes):", p.stat().st_size),
):
    print(caption, value)

Wrote: E:\Projects\Semantic_models_for-MSE\data\full_dataset_Bandgap_0_to_5.ttl
Triples in graph: 18
File size (bytes): 768



## 5) SPARQL queries (quick checks)
query the in‑memory graph via `rdflib` to verify the ontology + data mapping.


In [41]:
# Q1) Top 20 materials by band gap among those with Eg > 1 eV
# (the external id is optional, so materials without one are still listed)
q1 = """
PREFIX ex: <http://example.org/mse#>
SELECT ?formula ?id ?Eg
WHERE {
  ?m a ex:Material ;
     rdfs:label ?formula ;
     ex:hasBandGap ?Eg .
  OPTIONAL { ?m ex:hasExternalId ?id }
  FILTER(?Eg > 1.0)
}
ORDER BY DESC(?Eg)
LIMIT 20
"""
# The rdfs: prefix is not declared in the query text; it is supplied here.
q1_rows = g.query(q1, initNs={"rdfs": RDFS})
for hit in q1_rows:
    print(hit)

In [7]:
# Q2) Non-centrosymmetric materials (first 20)
# The earlier "synthesized by MOCVD" check relied on ex:synthesizedBy, which the
# ontology in this notebook does not declare (synthesis is not modeled), so it
# could never return a row. This version checks a property that is declared.
# The flag is compared through its lower-cased string form so the match does not
# depend on how the literal was typed when it was added.
q2 = """PREFIX ex: <http://example.org/mse#>
SELECT ?material ?id
WHERE {
  ?m a ex:Material ;
     rdfs:label ?material ;
     ex:hasCentrosymmetric ?centro .
  OPTIONAL { ?m ex:hasExternalId ?id }
  FILTER(LCASE(STR(?centro)) = "false")
}
LIMIT 20
"""
for row in g.query(q2, initNs={"rdfs": RDFS}):
    print(row)

In [None]:
# Q3) Materials in the cubic crystal system (first 20)
# The ontology declares ex:hasCrystalSystem with a string value (e.g. "cubic");
# ex:hasCrystalStructure and a "Diamond cubic" node are not part of it, so the
# previous pattern could never match. STR() keeps the comparison independent of
# whether the literal was stored plain or typed as xsd:string.
q3 = """PREFIX ex: <http://example.org/mse#>
SELECT ?material
WHERE {
  ?m a ex:Material ;
     rdfs:label ?material ;
     ex:hasCrystalSystem ?cs .
  FILTER(LCASE(STR(?cs)) = "cubic")
}
LIMIT 20
"""
for row in g.query(q3, initNs={"rdfs": RDFS}):
    print(row)

## 6) Creating safeguards for possible problems in data scraping
keep a few small rules here to catch obvious issues (labels missing, negative band gaps, etc.).

In [6]:
problems = []

# A) All Materials should have labels
for s in g.subjects(RDF.type, EX.Material):
    if next(g.objects(s, RDFS.label), None) is None:
        problems.append(f"Material without label: {s}")

# B) Band gap must be numeric and non-negative
for s, p, o in g.triples((None, EX.hasBandGap, None)):
    try:
        val = float(o.toPython())
    except (TypeError, ValueError, AttributeError):
        problems.append(f"Non-numeric band gap for {s}: {o}")
        continue
    if val != val:
        # NaN compares unequal to itself. It is what a coerced, unparsable CSV
        # value turns into, and "val < 0" alone would let it through.
        problems.append(f"Non-numeric band gap for {s}: {o}")
    elif val < 0:
        problems.append(f"Negative band gap for {s}")

print("No obvious problems ✅" if not problems else "Consistency problems:")
for x in problems:
    print("-", x)

No obvious problems ✅



## 7) Placeholder for LLM‑assisted extraction
When I replace this stub with a real LLM/NLP call, I’ll feed abstracts/tables and get back candidate triples to add to the graph. This cell is just a demonstration of the tediousness one encounters when manually adding facts to the KG :xD.


In [7]:
def propose_triples_from_text(text: str):
    """
    Stand-in for a future NLP/LLM extractor.

    ``text`` is not inspected: the function always answers with one
    hard-coded fact, as if 'GaN has Eg ~3.4 eV' had been parsed from it.
    Returns a list of (subject, predicate, object) tuples.
    """
    gan_band_gap = Literal(3.4, datatype=XSD.float)
    demo_fact = (EX.GaN, EX.hasBandGap, gan_band_gap)
    return [demo_fact]

# Feed whatever the stub proposes into the graph
demo_sentence = "GaN has band gap ~3.4 eV"
for triple in propose_triples_from_text(demo_sentence):
    g.add(triple)

print("Triples after stub insert:", len(g))

Triples after stub insert: 19



## 8) Save again after updates
I keep the TTL in sync with the in‑memory graph.


In [8]:
# --- Refresh the TTL file so it matches the in-memory graph ---
refreshed_ttl = g.serialize(format="turtle", encoding="utf-8")  # bytes, because an encoding is given
TTL_OUT.write_bytes(refreshed_ttl)

print("Updated:", TTL_OUT.resolve())
print("Triples now in graph:", len(g))

Updated: E:\Projects\Semantic_models_for-MSE\data\full_dataset_Bandgap_0_to_5.ttl
Triples now in graph: 19


## 9: Plotting a knowledge graph

*for the tiny (n=5) dataset*

In [None]:
# from pyvis.network import Network
# from rdflib import Graph, URIRef, BNode, Literal


# def display_label(term):
#     """Human label for a node/edge: prefer rdfs:label, then QName, then short str."""
#     lab = g.value(term, RDFS.label)
#     if lab:
#         return str(lab).strip()
#     # QName only for URIRefs
#     if isinstance(term, URIRef):
#         try:
#             return g.namespace_manager.normalizeUri(term)
#         except Exception:
#             pass
#     s = str(term).strip()
#     return s

# def node_id(term):
#     """Stable ID for pyvis (avoid raw labels)."""
#     # Use actual URI for URIRefs/BNodes; fallback to hash
#     if isinstance(term, (URIRef, BNode)):
#         return str(term)
#     return f"lit:{hash((str(term), type(term).__name__))}"

# def node_style(term):
#     """Color by class where possible (Material / CrystalStructure / SynthesisMethod)."""
#     # Try to infer from rdf:type (lightweight check)
#     types = set(g.objects(term, RDF.type)) if isinstance(term, (URIRef, BNode)) else set()
#     # Resolve EX namespace if present
#     ns = dict(g.namespace_manager.namespaces())
#     EX = ns.get("ex")
#     def is_type(tname):
#         return any(str(t).endswith(f"#{tname}") or str(t).endswith(f"/{tname}") for t in types)
#     if is_type("Material"):
#         return dict(color="#2b8a3e", shape="ellipse")
#     if is_type("CrystalStructure"):
#         return dict(color="#1c7ed6", shape="ellipse")
#     if is_type("SynthesisMethod"):
#         return dict(color="#e8590c", shape="ellipse")
#     if isinstance(term, Literal):
#         return dict(color="#bfbfbf", shape="box")
#     return dict(color="#666666", shape="ellipse")

# def visualize_graph_pyvis(g, max_edges=1500, show_literals=False, height="700px"):
#     net = Network(height=height, width="100%", directed=True, notebook=True,
#                   cdn_resources="in_line")  # avoid the Jupyter warning
#     net.toggle_physics(True)

#     added = set()
#     edge_count = 0

#     for s, p, o in g.triples((None, None, None)):
#         if edge_count >= max_edges:
#             break

#         # skip literal nodes unless asked
#         if not show_literals and isinstance(o, Literal):
#             # still add edge to a small boxed literal if you like:
#             # continue
#             pass

#         sid = node_id(s); so = node_id(o)
#         if sid not in added:
#             net.add_node(sid, label=display_label(s), **node_style(s))
#             added.add(sid)
#         if show_literals or not isinstance(o, Literal):
#             if so not in added:
#                 net.add_node(so, label=display_label(o), **node_style(o))
#                 added.add(so)

#         net.add_edge(sid, so, label=display_label(p))
#         edge_count += 1

#     net.show("kg.html")
#     print("Wrote: kg.html (open this file in your browser)")

# visualize_graph_pyvis(g, max_edges=1000, show_literals=True)

kg.html
Wrote: kg.html (open this file in your browser)


*For the large (>150k rows), mostly-semiconductor (0–5 eV) dataset*  
- via `HTML`

In [9]:
# Lightweight KG visualization adapted to  ontology (Material, CrystalStructure, bandgap, crystal_system, is_centrosymmetric)
from pyvis.network import Network
from rdflib import Graph, URIRef, BNode, Literal, RDF, RDFS

def display_label(term):
    """Text to show for a node or edge.

    Order of preference: the term's rdfs:label in the notebook-level graph; for
    IRIs the form produced by the namespace manager; if that fails, the part of
    the IRI after the last '#' and then after the last '/'; otherwise str(term).
    """
    explicit = g.value(term, RDFS.label)
    if explicit:
        return str(explicit).strip()
    if not isinstance(term, URIRef):
        return str(term).strip()
    try:
        return g.namespace_manager.normalizeUri(term)
    except Exception:
        after_hash = str(term).rsplit("#", 1)[-1]
        return after_hash.rsplit("/", 1)[-1]

def node_id(term):
    """Stable ID for pyvis.

    IRIs and blank nodes are identified by their string form. Any other term
    (in practice a literal) gets "lit:" plus an md5 digest of its type name and
    text. The built-in ``hash()`` is salted per interpreter run for strings, so
    it cannot provide IDs that stay the same from one run to the next; a digest
    can. Equal literals still share one node, as before.
    """
    import hashlib  # local import: the notebook only imports hashlib in a later cell

    if isinstance(term, (URIRef, BNode)):
        return str(term)
    key = f"{type(term).__name__}\x00{term}"
    return "lit:" + hashlib.md5(key.encode("utf-8")).hexdigest()

def node_style(term):
    """pyvis colour/shape for a term.

    Green ellipse for a Material, blue ellipse for a CrystalStructure (decided
    by the local name of its rdf:type in the notebook-level graph), grey box for
    a literal, dark-grey ellipse for anything else.
    """
    if isinstance(term, (URIRef, BNode)):
        type_iris = [str(t) for t in g.objects(term, RDF.type)]
    else:
        type_iris = []

    def has_type(local):
        return any(iri.endswith((f"#{local}", f"/{local}")) for iri in type_iris)

    for local, colour in (("Material", "#2b8a3e"), ("CrystalStructure", "#1c7ed6")):
        if has_type(local):
            return {"color": colour, "shape": "ellipse"}
    if isinstance(term, Literal):
        return {"color": "#bfbfbf", "shape": "box"}
    return {"color": "#666666", "shape": "ellipse"}

def visualize_graph_pyvis(g, max_edges=2000, show_literals=True, height="700px", out_html="kg.html"):
    """Draw up to ``max_edges`` triples of ``g`` as an interactive pyvis network.

    Triples whose object is a literal are left out when ``show_literals`` is
    false. Node labels and styles come from ``display_label`` / ``node_style``.
    The result is written to ``out_html``.
    """
    net = Network(height=height, width="100%", directed=True, notebook=True, cdn_resources="in_line")
    net.toggle_physics(True)

    # Applied in this order to turn a predicate label into a short column-like name.
    shorter_names = (
        ("ex:", ""),
        ("hasCrystalSystem", "crystal_system"),
        ("hasCentrosymmetric", "centrosymmetric"),
        ("hasBandGap", "band_gap_eV"),
        ("hasFormula", "formula"),
        ("hasExternalId", "material_id"),
    )

    known_nodes = set()
    drawn = 0
    for subj, pred, obj in g.triples((None, None, None)):
        if drawn >= max_edges:
            break
        if isinstance(obj, Literal) and not show_literals:
            continue

        sid, oid = node_id(subj), node_id(obj)
        for key, term in ((sid, subj), (oid, obj)):
            if key in known_nodes:
                continue
            net.add_node(key, label=display_label(term), **node_style(term))
            known_nodes.add(key)

        edge_text = display_label(pred)
        for old, new in shorter_names:
            edge_text = edge_text.replace(old, new)

        net.add_edge(sid, oid, label=edge_text)
        drawn += 1

    net.show(out_html)
    print(f"Wrote: {out_html} (open in your browser)")

# Render at most 1000 triples, literal values included, into kg.html
visualize_graph_pyvis(g, out_html="kg.html", show_literals=True, max_edges=1000)

kg.html
Wrote: kg.html (open in your browser)


- Via `Graphviz`

In [21]:
import os

# Make the Graphviz executables discoverable (default Windows install location).
# Guarded so that re-running the cell does not append the same entry again, and
# so that an unset PATH does not raise KeyError.
GRAPHVIZ_BIN = r"C:\Program Files\Graphviz\bin"
_current_path = os.environ.get("PATH", "")
if GRAPHVIZ_BIN not in _current_path:
    os.environ["PATH"] = os.pathsep.join(part for part in (_current_path, GRAPHVIZ_BIN) if part)
from graphviz import Digraph
import hashlib

def nlabel(t):
    """Label for a Graphviz node/edge: rdfs:label if present, else the prefixed IRI, else str(t)."""
    lab = g.value(t, RDFS.label)
    if lab:
        return str(lab)
    if isinstance(t, URIRef):
        try:
            return g.namespace_manager.normalizeUri(t)
        except Exception:
            # Only ordinary errors are swallowed here; a bare "except:" would
            # also trap KeyboardInterrupt/SystemExit. Fall through to str(t).
            pass
    return str(t)

def nid(t):
    """DOT-safe node identifier: the md5 hex digest of the term's string form."""
    digest = hashlib.new("md5")
    digest.update(str(t).encode("utf-8"))
    return digest.hexdigest()

def nshape(t):
    """Graphviz shape: a box for literal values, an ellipse for everything else."""
    if isinstance(t, Literal):
        return "box"
    return "ellipse"

def ncolor(t):
    """Outline colour by rdf:type local name: green for Material, blue for CrystalStructure, grey otherwise."""
    if isinstance(t, URIRef):
        type_iris = [str(tt) for tt in g.objects(t, RDF.type)]
    else:
        type_iris = []

    def has(local):
        return any(iri.endswith((f"#{local}", f"/{local}")) for iri in type_iris)

    for local, colour in (("Material", "#2b8a3e"), ("CrystalStructure", "#1c7ed6")):
        if has(local):
            return colour
    return "#666666"

dot = Digraph(engine="dot")  # or "sfdp" for large graphs
dot.attr(rankdir="LR")

# One DOT node per distinct term, one labelled edge per triple.
seen = set()
for s, p, o in g.triples((None, None, None)):
    sid, oid = nid(s), nid(o)
    for key, term in ((sid, s), (oid, o)):
        if key in seen:
            continue
        dot.node(key, label=nlabel(term), shape=nshape(term), color=ncolor(term))
        seen.add(key)
    dot.edge(sid, oid, label=nlabel(p).replace("ex:", ""))

# cleanup=True removes the intermediate DOT source after rendering
dot.render("kg_graphviz", format="png", cleanup=True)
print("Wrote: kg_graphviz.png")

Wrote: kg_graphviz.png


# Utilizing `LLM`  in `KG pipeline`  
The KG is like a well-organized library: every material is a “book,” and the properties (band gap, crystal system, centrosymmetry, formula) are the “catalog cards.”  

The LLM is like a librarian who understands natural language: I can ask, “Which semiconductors are cubic and non-centrosymmetric with band gap > 2 eV?” and the LLM can translate that into graph queries or even propose new triples. Without the LLM, I’d have to speak in SPARQL (machine query language). With the LLM, I can speak in plain English, and it reformulates my request into the right graph operations.  

Another role: the LLM can ingest external text (papers, reports) and suggest new triples to insert, like the librarian reading new books and updating the catalog automatically.  

`Net effect:`
KG = structured factual memory (precise, but rigid). LLM = flexible reasoning & translation layer (imprecise, but good at language). Together = I get both rigor and flexibility.


*Quick smoke test*
- ChatGPT: a Plus subscription does not include API access.

In [13]:
# from openai import OpenAI
# import os, json

# assert os.getenv("OPENAI_API_KEY"), "API key missing"
# client = OpenAI()

# resp = client.chat.completions.create(
#     model="gpt-4o-mini",
#     response_format={"type": "json_object"},   # ← JSON mode
#     messages=[
#         {"role":"system","content":"Return valid JSON only."},
#         {"role":"user","content":"Respond with {\"ok\": true}"}
#     ],
#     temperature=0
# )
# print(json.loads(resp.choices[0].message.content))

*Quick smoke test*  
- Ollama lets LLMs run on the local machine and exposes them through an API.

In [None]:
# import json, ollama
# r = ollama.chat(
#     model='llama3.2:3b',
#     messages=[{"role":"system","content":"Return ONLY valid JSON."},
#               {"role":"user","content":'{"ok": true}'}],
#     format='json',
#     options={'temperature': 0}
# )
# print(json.loads(r['message']['content']))

{'ok': True}


**Wiring `ollama` into the pipeline.**  
a. `ingestion` via OLLAMA  
1. Schema

In [10]:
# Pydantic schema for LLM ↔ KG handoff (natural language → structured facts)
from pydantic import BaseModel, Field
from typing import Optional

class RowOut(BaseModel):
    # One normalized record handed from the LLM step to the KG step.
    # Only `material` is required; every other field defaults to None so that a
    # value the model could not determine can simply be left out.

    # Human-readable material label (e.g., formula); becomes rdfs:label on EX.Material
    material: str

    # Columns we actually model in this project
    formula: Optional[str] = None                 # -> ex:hasFormula
    material_id: Optional[str] = None             # -> ex:hasExternalId
    crystal_system: Optional[str] = None          # -> ex:hasCrystalSystem
    is_centrosymmetric: Optional[bool] = None     # -> ex:hasCentrosymmetric

    # Numeric property (kept), no synthesis/lattice here
    # ge=0: validation fails for a negative band gap
    band_gap_eV: Optional[float] = Field(default=None, ge=0)  # -> ex:hasBandGap (eV)

2. Building 'hints' from data

In [11]:
# (LLM role: use these sets to validate/normalize outputs before making triples)

# Distinct values actually present in the data, sorted for stable display.
allowed_crystal_systems = sorted({str(value).strip() for value in df['crystal_system'].dropna()})
allowed_centrosym = sorted({bool(flag) for flag in df['is_centrosymmetric'].dropna()})

## aliases for my old schema:
# allowed_structs = allowed_crystal_systems  
# allowed_methods = []    

print("crystal_systems:", allowed_crystal_systems)
print("centrosymmetric values:", allowed_centrosym)

crystal_systems: ['cubic', 'hexagonal', 'monoclinic', 'orthorhombic', 'tetragonal', 'triclinic', 'trigonal']
centrosymmetric values: [False, True]


3. Normalizing the data rows (the ones that are going to be parsed) with `ollama`.  
`Pydantic:` Python's most popular data validation library that can turn type hints into runtime validation rules. Instead of writing dozens of if isinstance() checks and custom validation functions, you define your data structure once using familiar Python syntax

In [12]:
# LLM role: normalize free-text → canonical KG schema (RAM-friendly w/ fallbacks)
import json, ollama
from pydantic import ValidationError

# Models are tried in this order, smallest/quantized first.
# A missing one can be fetched with e.g. `ollama pull llama3.2:1b`.
OLLAMA_MODEL_CANDIDATES = [
    "llama3.2:1b",
    "llama3.2:1b-instruct",
    "llama3.2:3b-instruct-q4_0",
    "llama3.2:3b-q4_0",
    "llama3.2:3b",  # last resort
]

# Generation options shared by every normalization call.
OLLAMA_OPTIONS = dict(
    temperature=0,
    num_ctx=512,   # shrink context to save RAM
    num_batch=16,  # smaller batches
    # num_gpu=0,   # uncomment to force CPU if VRAM is tight
)

# System prompt; the crystal systems found in the data are interpolated into it.
SYSTEM = f"""
You are a materials KG assistant.
Return ONLY JSON with keys EXACTLY:
material, formula, material_id, crystal_system, is_centrosymmetric, band_gap_eV.

Normalization rules:
- crystal_system → pick from this list when possible: {allowed_crystal_systems}
- is_centrosymmetric → boolean true/false (accept yes/no/1/0/centro/non-centro)
- band_gap_eV → float in eV; use null if unknown
- formula/material_id may be null if unknown
- Do not invent values; prefer null over guesses
"""

def _to_bool(x):
    """Map a loosely written truth value to True / False / None.

    Accepts booleans, numbers and strings (case and surrounding whitespace are
    ignored). Besides the usual true/false, yes/no, y/n, t/f and 1/0 spellings,
    the centrosymmetry wording that the system prompt tells the model it may
    use ("centro" / "non-centro" and close variants) and the float spellings
    "1.0" / "0.0" are recognized. None and anything unrecognized give None.
    """
    if x is None:
        return None
    truthy = {"true", "yes", "y", "1", "1.0", "t",
              "centro", "centrosymmetric", "centro-symmetric"}
    falsy = {"false", "no", "n", "0", "0.0", "f",
             "non-centro", "noncentro", "non-centrosymmetric",
             "noncentrosymmetric", "non centrosymmetric"}
    s = str(x).strip().lower()
    if s in truthy:
        return True
    if s in falsy:
        return False
    return None

def _pick_model():
    """Return the first candidate model that answers a tiny chat request.

    Candidates are probed in list order; a candidate whose request raises is
    skipped. Raises RuntimeError when none of them responds.
    """
    def _responds(name):
        try:
            # cheap ping to see if it loads in memory
            ollama.chat(model=name, messages=[{"role":"user","content":"ping"}], options={'num_ctx':128,'temperature':0})
        except Exception:
            return False
        return True

    chosen = next((name for name in OLLAMA_MODEL_CANDIDATES if _responds(name)), None)
    if chosen is None:
        raise RuntimeError("No suitable Ollama model available. Pull one of: " + ", ".join(OLLAMA_MODEL_CANDIDATES))
    return chosen

# Probed once when the cell runs; used as the starting model for normalization.
_MODEL = _pick_model()
print("Using Ollama model:", _MODEL)

def _rowout_text(value):
    """Stripped string form of ``value``, or None when it is None or blank."""
    if value is None:
        return None
    text = str(value).strip()
    return text or None


def _rowout_band_gap(value):
    """Non-negative float parsed from ``value``, or None when that is not possible."""
    try:
        gap = float(str(value).strip())
    except (TypeError, ValueError):
        return None
    if gap != gap or gap < 0:  # NaN or negative: prefer "unknown" over a guess
        return None
    return gap


def normalize_row_with_ollama(row: dict) -> RowOut:
    """Ask the local Ollama model to rewrite ``row`` into the ``RowOut`` schema.

    The model's JSON answer is validated with ``RowOut``. If validation fails,
    each field is coerced on its own and anything that cannot be coerced becomes
    None (``material`` becomes "Unknown"). This fallback cannot itself fail on
    typical model output such as ``"3.4 eV"``, a negative band gap, or a missing
    material name. Any other error (request failure, invalid JSON, a JSON value
    that is not an object) moves on to the next untried entry of
    ``OLLAMA_MODEL_CANDIDATES``; when none is left the last error is re-raised.
    """
    msg = f"Normalize this row to the schema: {row}"
    tried = set()
    model = _MODEL
    while True:
        try:
            resp = ollama.chat(
                model=model,
                messages=[{"role":"system","content": SYSTEM},
                          {"role":"user","content": msg}],
                format='json',
                options=OLLAMA_OPTIONS
            )
            data = json.loads(resp['message']['content'])
            if not isinstance(data, dict):
                raise ValueError("model did not return a JSON object")
            if "is_centrosymmetric" in data:
                data["is_centrosymmetric"] = _to_bool(data["is_centrosymmetric"])
            return RowOut(**data)
        except ValidationError:
            # Field-by-field, failure-proof coercion. `data` is always a dict here,
            # because RowOut(**data) is the only call that raises ValidationError.
            return RowOut(
                material=_rowout_text(data.get("material")) or "Unknown",
                formula=_rowout_text(data.get("formula")),
                material_id=_rowout_text(data.get("material_id")),
                crystal_system=_rowout_text(data.get("crystal_system")),
                is_centrosymmetric=_to_bool(data.get("is_centrosymmetric")),
                band_gap_eV=_rowout_band_gap(data.get("band_gap_eV")),
            )
        except Exception:
            tried.add(model)
            # try the next candidate that has not failed yet
            model = next((m for m in OLLAMA_MODEL_CANDIDATES if m not in tried), None)
            if not model:
                raise

Using Ollama model: llama3.2:3b


4. Appending normalized result into my graph.  
*safeguard for dedupe, provenance, and idempotency are wired.*    
- add two provenance props (ex:statedIn, ex:hasProvenanceId) to the ontology.  
- build in-memory indices of existing materials by material_id and by formula (for dedupe).  
- add_once(s,p,o): guard so the same triple isn’t added twice.  
- get_or_create_material(...): reuse an existing material by material_id (first) or formula (fallback); otherwise mint a new one and register it in the indices.   
- small cache for provenance nodes so the same (label,id) source isn’t duplicated.  
- ingest_normalized_row(...): idempotently attach hasFormula, hasExternalId, hasBandGap, hasCrystalSystem, hasCentrosymmetric; optionally link a provenance node via ex:statedIn; return the material IRI.

In [13]:
# LLM → KG: ingest a normalized row (idempotent, dedupe, provenance)

from rdflib import Literal, XSD, BNode, RDF, RDFS
from typing import Optional  # For python 3.10 users: replace str|None with Optional[str]

# --- 0) Ontology add-ons (once) ---
# Two extra predicates in the EX namespace, used further down in this cell:
#   statedIn        - attached to a material, pointing at its source node
#   hasProvenanceId - attached to a source node, carrying the source identifier
statedIn        = EX.statedIn
hasProvenanceId = EX.hasProvenanceId
for prop in [statedIn, hasProvenanceId]:
    # Declare each predicate as an rdf:Property with rdfs:Resource as both
    # domain and range (i.e. no narrower constraint is stated).
    g.add((prop, RDF.type, RDF.Property))
    g.add((prop, RDFS.domain, RDFS.Resource))
    g.add((prop, RDFS.range,  RDFS.Resource))

# --- 1) Fast lookup indices for dedupe by material_id / formula ---
def _index_materials():
    """Scan the graph and build the two lookup tables used for dedupe.

    Returns:
        A pair ``(id_index, formula_index)``. Each dict maps the string form
        of a material's ``hasExternalId`` / ``hasFormula`` value to the
        material node. If two materials share a key, the one visited last wins.
    """
    id_index = {}
    formula_index = {}
    for node in g.subjects(RDF.type, Material):
        for predicate, index in ((hasExternalId, id_index), (hasFormula, formula_index)):
            value = g.value(node, predicate)
            if value:  # skips a missing value (None) as well as a falsy literal
                index[str(value)] = node
    return id_index, formula_index


# Module-level indices consulted (and extended) by get_or_create_material().
MAT_BY_ID, MAT_BY_FORMULA = _index_materials()

# --- 2) Idempotent triple adder ---
def add_once(s, p, o):
    """Insert the triple ``(s, p, o)`` into ``g`` unless it is already present."""
    triple = (s, p, o)
    if triple in g:
        return
    g.add(triple)

# --- 3) Get-or-create material with dedupe rules ---
def get_or_create_material(material_id: Optional[str], formula: Optional[str], label: Optional[str], idx: int):
    """Return the graph node for a material, reusing an existing one if possible.

    Lookup order is ``material_id`` in ``MAT_BY_ID`` first, then ``formula`` in
    ``MAT_BY_FORMULA``. If neither matches, a new node is minted with
    ``mint_entity`` using the first non-empty of ``label``, ``formula``,
    ``material_id`` as its label.

    Whichever path produced the node, any key that was not yet indexed is
    attached to the node and registered. Previously a key that arrived on a
    *reused* node (e.g. a material_id seen for the first time on a row matched
    by formula) was never indexed, so a later row carrying only that key minted
    a duplicate material.

    Args:
        material_id: External identifier, or None/"" when unknown.
        formula: Chemical formula, or None/"" when unknown.
        label: Human-readable name used when minting a new node.
        idx: Row index forwarded to ``mint_entity``.

    Returns:
        The existing or newly minted material node.
    """
    m = None
    # Prefer material_id, else formula
    if material_id:
        m = MAT_BY_ID.get(material_id)
    if m is None and formula:
        m = MAT_BY_FORMULA.get(formula)
    if m is None:
        # Mint new
        m = mint_entity(label or formula or material_id, Material, "Material", idx)

    # Keep graph and indices in sync. Existing index entries are never
    # overwritten, so the first material registered under a key keeps it.
    if material_id and material_id not in MAT_BY_ID:
        add_once(m, hasExternalId, Literal(material_id, datatype=XSD.string))
        MAT_BY_ID[material_id] = m
    if formula and formula not in MAT_BY_FORMULA:
        add_once(m, hasFormula, Literal(formula, datatype=XSD.string))
        MAT_BY_FORMULA[formula] = m
    return m

# --- 4) Provenance node helper with small cache to avoid duplicates ---
# Provenance nodes created so far, keyed by (label, id); "" stands in for None.
_SOURCE_CACHE = {}


def make_source_node(source_label: Optional[str] = None, source_id: Optional[str] = None):
    """Return the blank node describing a source, creating it on first use.

    Args:
        source_label: Optional label, stored as ``rdfs:label`` on the node.
        source_id: Optional identifier, stored via ``hasProvenanceId``.

    Returns:
        The cached node for this ``(label, id)`` pair, or a fresh ``BNode``
        that has just been described in the graph and cached.
    """
    cache_key = (source_label or "", source_id or "")
    node = _SOURCE_CACHE.get(cache_key)
    if node is not None:
        return node

    node = BNode()
    for predicate, text in ((RDFS.label, source_label), (hasProvenanceId, source_id)):
        if text:
            add_once(node, predicate, Literal(text))
    _SOURCE_CACHE[cache_key] = node
    return node

# --- 5) Ingest (idempotent + dedupe + provenance) ---
def ingest_normalized_row(nr: RowOut, idx: int = 0,
                          source_label: Optional[str] = None,
                          source_id: Optional[str] = None):
    """Write one normalized row into the graph and return its material node.

    The material is resolved through ``get_or_create_material`` and every
    property is written with ``add_once``, so repeating the call with the
    same row does not add duplicate triples.

    Args:
        nr: Normalized row; ``material`` and ``band_gap_eV`` are read directly,
            the remaining fields via ``getattr`` with a ``None`` default.
        idx: Row index forwarded to ``get_or_create_material``.
        source_label: Optional provenance label.
        source_id: Optional provenance identifier.

    Returns:
        The material node the row was attached to.
    """
    ext_id = getattr(nr, "material_id", None)
    formula = getattr(nr, "formula", None)
    node = get_or_create_material(material_id=ext_id, formula=formula,
                                  label=nr.material, idx=idx)

    # String-valued identifiers: written only when non-empty.
    if formula:
        add_once(node, hasFormula, Literal(str(formula), datatype=XSD.string))
    if ext_id:
        add_once(node, hasExternalId, Literal(str(ext_id), datatype=XSD.string))

    # Band gap: 0.0 is a legitimate value, so test against None explicitly.
    gap = nr.band_gap_eV
    if gap is not None:
        add_once(node, hasBandGap, Literal(float(gap), datatype=XSD.float))

    system = getattr(nr, "crystal_system", None)
    if system:
        add_once(node, hasCrystalSystem, Literal(str(system), datatype=XSD.string))

    # False is meaningful here too; only None means "unknown".
    centro = getattr(nr, "is_centrosymmetric", None)
    if centro is not None:
        add_once(node, hasCentrosymmetric, Literal(bool(centro), datatype=XSD.boolean))

    # Provenance is linked only when the caller supplied a label or an id.
    if source_label or source_id:
        add_once(node, statedIn, make_source_node(source_label, source_id))

    return node

b. `Query via OLLAMA`  
**NL → SPARQL → rdflib query → DataFrame**

In [16]:
# NL → SPARQL with safety rails + sanitizer
import json, ollama, pandas as pd

# PREFIX declarations shared by every query: placed at the top of the user
# prompt in nl_to_sparql() and prepended by sanitize_sparql() when the model's
# answer lacks a "PREFIX ex:" line.
SPARQL_PREFIX = """PREFIX ex: <http://example.org/mse#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
"""

# System prompt for the NL -> SPARQL model: describes the class, its literal
# properties with the variable names to use, and the output rules.
# NOTE(review): the f-prefix is currently redundant - the text has no
# {placeholders}. Any literal brace added later must be doubled ("{{", "}}").
NL2SPARQL_SYSTEM = f"""
Generate SPARQL SELECT for this ontology:

Class:
  ex:Material

Properties on ex:Material (all literals):
  ex:hasFormula (xsd:string)         -> ?formula
  ex:hasExternalId (xsd:string)
  ex:hasBandGap (xsd:float)          -> ?bandgap
  ex:hasCrystalSystem (xsd:string)   -> ?crystal_system
  ex:hasCentrosymmetric (xsd:boolean)-> ?centro

Rules (IMPORTANT):
- Output ONLY a SPARQL SELECT query.
- Include the PREFIX block exactly as given by the user.
- Always BIND variables before filtering them.
- DO NOT use 'NOT'. To require non-centrosymmetric, bind and filter: ?m ex:hasCentrosymmetric ?centro . FILTER(?centro = false)
- If filtering band gap, ensure ?bandgap is bound (use OPTIONAL if the question doesn't require it).
- Default projection (if unspecified): ?m ?formula ?bandgap ?crystal_system ?centro
"""

def nl_to_sparql(question: str, model='llama3.2:1b'):
    """Ask an Ollama chat model to translate a question into a SPARQL SELECT.

    The model receives ``NL2SPARQL_SYSTEM`` as system prompt and a user prompt
    made of ``SPARQL_PREFIX`` plus the question. A Markdown code fence around
    the reply is removed before the text is passed to ``sanitize_sparql``.

    Args:
        question: Natural-language question about the knowledge graph.
        model: Name of the Ollama model to query.

    Returns:
        The sanitized SPARQL query string.
    """
    prompt = f"""{SPARQL_PREFIX}
# Question:
{question}
# Write a valid SPARQL SELECT:"""
    resp = ollama.chat(
        model=model,
        messages=[{"role":"system","content": NL2SPARQL_SYSTEM},
                  {"role":"user","content": prompt}],
        options={'temperature': 0, 'num_ctx': 512}
    )
    q = resp['message']['content'].strip()
    if q.startswith("```"):
        # Keep only the first fenced block. This drops the closing fence and
        # any prose after it, and no longer raises IndexError when the whole
        # reply sits on a single line.
        inner = q[3:].split("```", 1)[0]
        # Drop a language tag ("sparql"/"sql") right after the opening fence,
        # without discarding a first line that is already part of the query.
        inner = re.sub(r"\A[ \t]*(?:(?:sparql|sql)\b)?[ \t]*\n?", "", inner,
                       count=1, flags=re.IGNORECASE)
        q = inner.strip()
    return sanitize_sparql(q)

def _insert_after_where(q: str, clause: str) -> str:
    """Insert *clause* on a new line right after the first ``WHERE {`` of *q*."""
    # Case-insensitive and whitespace-tolerant ("where{", "WHERE\n{"), and
    # limited to the outermost WHERE so subqueries are left untouched.
    return re.sub(
        r"\bWHERE\s*\{",
        lambda m: f"{m.group(0)}\n  {clause}",
        q,
        count=1,
        flags=re.IGNORECASE,
    )


def sanitize_sparql(q: str) -> str:
    """Patch common defects in a model-generated SPARQL SELECT query.

    Steps, in order:
      1. Prepend ``SPARQL_PREFIX`` when no ``PREFIX ex:`` line is present.
      2. Rewrite two known-bad ``NOT`` forms into bind + ``FILTER(?centro = false)``.
      3. Bind ``?bandgap`` (OPTIONAL) / ``?centro`` (required) when the query
         mentions them without the corresponding ``ex:`` predicate.
      4. Expand ``SELECT *`` into the default projection.
      5. OPTIONAL-bind ``?formula`` and ``?crystal_system`` if their predicates
         are absent, so those columns are populated.

    Injected patterns go right after the first ``WHERE {``, matched regardless
    of case and spacing. A query with no ``WHERE`` keyword is returned without
    injected bindings.

    Args:
        q: Raw query text from the model.

    Returns:
        The patched query text.
    """
    # Ensure PREFIX present
    if "PREFIX ex:" not in q:
        q = SPARQL_PREFIX + "\n" + q

    # Replace bad 'NOT ?m ex:hasCentrosymmetric .' usages
    q = q.replace("NOT ?m ex:hasCentrosymmetric .", "?m ex:hasCentrosymmetric ?centro . FILTER(?centro = false)")
    q = q.replace("NOT (?m ex:hasCentrosymmetric)", "?m ex:hasCentrosymmetric ?centro . FILTER(?centro = false)")

    # Ensure bindings exist before filters. Both checks use this snapshot, so
    # they reflect the query before any binding is injected.
    where_lower = q.lower()
    needs_bandgap = "bandgap" in where_lower and "ex:hasbandgap" not in where_lower
    if needs_bandgap:
        # OPTIONAL so materials without a band gap are not dropped by the binding
        q = _insert_after_where(q, "OPTIONAL { ?m ex:hasBandGap ?bandgap }")

    needs_centro = ("centro" in where_lower) and ("ex:hascentrosymmetric" not in where_lower)
    if needs_centro:
        q = _insert_after_where(q, "?m ex:hasCentrosymmetric ?centro .")

    # Ensure projection columns if none are provided
    if "SELECT" in q and "*" in q.split("WHERE")[0]:
        q = q.replace("SELECT *", "SELECT ?m ?formula ?bandgap ?crystal_system ?centro")

    # Always OPTIONAL-bind formula/crystal_system for nicer outputs
    if "ex:hasFormula" not in q:
        q = _insert_after_where(q, "OPTIONAL { ?m ex:hasFormula ?formula }")
    if "ex:hasCrystalSystem" not in q:
        q = _insert_after_where(q, "OPTIONAL { ?m ex:hasCrystalSystem ?crystal_system }")

    return q

def run_sparql(query: str):
    """Run *query* against ``g`` and return the bindings as a DataFrame.

    Column names are the string forms of the result variables. Every bound
    value is converted with ``str``; unbound values stay ``None``.
    """
    result = g.query(query)
    columns = list(map(str, result.vars))
    records = []
    for binding in result:
        record = {}
        for name, term in zip(columns, binding):
            record[name] = None if term is None else str(term)
        records.append(record)
    return pd.DataFrame(records, columns=columns)

def ask_kg(question: str, model=None):
    """Answer a natural-language question from the knowledge graph.

    Translates the question with ``nl_to_sparql``, prints the generated
    query, and returns the result of ``run_sparql`` on it.

    Args:
        question: Natural-language question.
        model: Ollama model name; a falsy value selects ``_MODEL``, the model
            chosen in the normalization cell.
    """
    chosen_model = model if model else _MODEL
    sparql = nl_to_sparql(question, model=chosen_model)
    print("SPARQL:\n", sparql)
    return run_sparql(sparql)

**Pre-Scale-up sanity-check.**  
Demo: add a few new entries from text, then query

In [17]:
# --- New knowledge from text (fabricated) → RowOut via LLM → ingest with provenance ---
# Three made-up sentences, each stating a formula, band gap, structure and
# centrosymmetry for one material.
texts = [
    "β-Ga2O3 has a band gap around 4.8 eV and is monoclinic; it's centrosymmetric.",
    "InP (zinc blende) band gap ~1.34 eV; non-centrosymmetric.",
    "ZnO wurtzite, band gap ≈ 3.3 eV; non-centrosymmetric.",
]

# k starts at 1 and is reused for both the minting index and the provenance id.
for k, t in enumerate(texts, 1):
    nr = normalize_row_with_ollama({"text": t})                             # LLM extracts to the schema
    ingest_normalized_row(nr, idx=100000+k,                                 # large idx to avoid collisions
                          source_label="fabricated_demo",                   # provenance
                          source_id=f"demo:{k}")

# Total triple count in the graph after ingestion (not just the new ones).
print("Triples after new text ingestion:", len(g))

# NL query via LLM → SPARQL
# As the last expression of the cell, the returned DataFrame is displayed.
ask_kg("List non-centrosymmetric semiconductors with band gap > 3 eV, show formula and crystal system.")

Triples after new text ingestion: 48
SPARQL:
 PREFIX ex: <http://example.org/mse#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?m ?formula ?bandgap ?crystal_system 
WHERE {
  ?m ex:hasCentrosymmetric ?centro .
  FILTER(?centro = false)
  ?m ex:hasFormula ?formula .
  ?m ex:hasBandGap ?bandgap .
  ?m ex:hasCrystalSystem ?crystal_system .
}



Unnamed: 0,m,formula,bandgap,crystal_system


*Ingest a new CSV (dedupe by material_id→formula)*  
**Usage:**  
*ingest_new_csv("../data/new_semiconductors.csv", update_df=False)  # KG only*  
*ingest_new_csv("../data/new_semiconductors.csv", update_df=True)   # also append to df*

In [None]:
import pandas as pd

def _norm_str(x):
    return str(x).strip() if pd.notna(x) and str(x).strip() not in {"", "nan", "None"} else None

def _parse_bool_cell(value):
    """Interpret one CSV cell as True, False, or None (unknown)."""
    if isinstance(value, str):
        token = value.strip().lower()
        if token in {"true", "t", "yes", "y", "1"}:
            return True
        if token in {"false", "f", "no", "n", "0"}:
            return False
        # Blank or unrecognised text: record nothing rather than guess.
        return None
    if pd.isna(value):
        return None
    return bool(value)


def ingest_new_csv(csv_path: str, update_df: bool = False):
    """Load a CSV of materials and ingest every row into the knowledge graph.

    Known alternative column names are renamed to the ``RowOut`` field names.
    Each row is ingested through ``ingest_normalized_row`` with provenance
    label ``"csv_import"`` and the CSV path as provenance id.

    Centrosymmetry cells are parsed with ``_parse_bool_cell``. The previous
    plain ``bool(cell)`` turned every non-empty text cell, including "no" or
    "N", into True.

    Args:
        csv_path: Path of the CSV file to read.
        update_df: When True, also append the (renamed) rows to the global
            ``df`` without dedupe.

    Returns:
        The number of rows processed. This counts every row, not only those
        that created new triples.
    """
    global df  # rebound only when update_df is True

    new_df = pd.read_csv(csv_path)

    # align columns if needed
    rename_map = {
        "band_gap": "band_gap_eV",
        "crystalsystem": "crystal_system",
        "centrosymmetric": "is_centrosymmetric",
    }
    new_df = new_df.rename(columns=rename_map)

    added = 0
    for i, r in new_df.iterrows():
        formula = _norm_str(r.get("formula"))
        material_id = _norm_str(r.get("material_id"))
        band_gap = r.get("band_gap_eV")
        nr = RowOut(
            # Label falls back from formula to id to a synthetic name.
            material=formula or material_id or f"Material_new_{i}",
            formula=formula,
            material_id=material_id,
            crystal_system=_norm_str(r.get("crystal_system")),
            is_centrosymmetric=_parse_bool_cell(r.get("is_centrosymmetric")),
            band_gap_eV=(float(band_gap) if pd.notna(band_gap) else None),
        )
        # idempotent + dedup-aware
        ingest_normalized_row(nr, idx=1_000_000 + i, source_label="csv_import", source_id=csv_path)
        added += 1

    if update_df:
        # Naive append. For real dedupe, follow the concat with
        # .drop_duplicates(subset=["material_id", "formula"], keep="first").
        df = pd.concat([df, new_df], ignore_index=True)

    print(f"Ingested {added} rows from {csv_path}. Triples now: {len(g)}")
    return added

In [None]:
# Bulk ingest the whole DataFrame into the KG (no LLM; idempotent & dedup aware)

import math

def _get_str(x):
    return str(x).strip() if x is not None and str(x).strip() not in {"", "nan", "None"} else None

# Walk every row of the global DataFrame, wrap it in a RowOut and push it
# through ingest_normalized_row (no provenance is attached here).
ingested = 0
for i, r in df.iterrows():
    nr = RowOut(
        # Label falls back from formula to material_id to a synthetic name.
        material = _get_str(r.get("formula")) or _get_str(r.get("material_id")) or f"Material_{i}",
        formula  = _get_str(r.get("formula")),
        material_id = _get_str(r.get("material_id")),
        crystal_system = _get_str(r.get("crystal_system")),
        # NOTE(review): bool() maps any non-empty string (even "False") to True;
        # this assumes the column already holds real booleans - confirm.
        is_centrosymmetric = (bool(r["is_centrosymmetric"]) if pd.notna(r.get("is_centrosymmetric")) else None),
        band_gap_eV = (float(r["band_gap_eV"]) if pd.notna(r.get("band_gap_eV")) else None),
    )
    # The DataFrame index label doubles as the minting index.
    ingest_normalized_row(nr, idx=i)
    ingested += 1
    # Progress report every 2000 rows.
    if ingested % 2000 == 0:
        print(f"Ingested {ingested} rows… Triples: {len(g)}")

print(f"Done. Ingested {ingested} rows total. Triples now: {len(g)}")