# Mini Materials Knowledge Graph — Common Semiconductors
**What I'm building:** a tiny pipeline that turns a small semiconductors table into **RDF triples**, then I query it using **SPARQL**. I keep the ontology minimal and readable.


## Some basic terminology
- **Ontology** → my small schema + vocabulary for this domain (classes + relations).
- **RDF triple** → one fact written as `subject — predicate — object`.
- **Namespace** → URL prefix so my identifiers are unique.
- **SPARQL** → my query tool for RDF graphs (like SQL but for triples).
- **Turtle (.ttl)** → compact text format to store RDF.


**Imports**

In [6]:
import hashlib
import math
import re
from pathlib import Path

import pandas as pd
from rdflib import BNode, Graph, Literal, Namespace, URIRef
from rdflib.namespace import RDF, RDFS, XSD

## 1) Setup
read a tiny CSV and map it into RDF using `rdflib`.


In [2]:
# Input table and Turtle output sit side by side in the repo's data folder.
DATA_DIR = Path("..") / "data"
DATA = DATA_DIR / "semiconductors_small.csv"  # CSV lives in the repo
TTL_OUT = DATA.with_suffix(".ttl")            # RDF Turtle output, same stem as the CSV
print("Using data:", DATA.resolve())

Using data: E:\Projects\Semantic_models_for-MSE\data\semiconductors_small.csv



## 2) Ontology skeleton (small and pragmatic)
A few classes and properties that I actually need for the initial CSV I am using.  
Classes: `Material`, `SynthesisMethod`, `CrystalStructure`, `Property`  
Properties:  
- data: `hasBandGap` (float eV), `hasLatticeConstant` (float Å)  
- object: `hasCrystalStructure`, `synthesizedBy`


In [3]:
# One in-memory graph holds both the schema and, later, the instance data.
g = Graph()

# Namespace for my identifiers (can switch to a real domain later)
EX = Namespace("http://example.org/mse#")
for prefix, namespace in (("ex", EX), ("rdfs", RDFS), ("xsd", XSD)):
    g.bind(prefix, namespace)

# Classes
Material = EX.Material
SynthesisMethod = EX.SynthesisMethod
CrystalStructure = EX.CrystalStructure
Property = EX.Property

for cls in (Material, SynthesisMethod, CrystalStructure, Property):
    g.add((cls, RDF.type, RDFS.Class))

# Properties
hasBandGap = EX.hasBandGap
hasLatticeConstant = EX.hasLatticeConstant
hasCrystalStructure = EX.hasCrystalStructure
synthesizedBy = EX.synthesizedBy

for prop in (hasBandGap, hasLatticeConstant, hasCrystalStructure, synthesizedBy):
    g.add((prop, RDF.type, RDF.Property))

# Light domain/range annotations (sanity helpers for later validation).
# Every property has Material as its domain; only the range differs.
for prop, value_range in (
    (hasBandGap, XSD.float),
    (hasLatticeConstant, XSD.float),
    (hasCrystalStructure, CrystalStructure),
    (synthesizedBy, SynthesisMethod),
):
    g.add((prop, RDFS.domain, Material))
    g.add((prop, RDFS.range, value_range))

print("Ontology initialized. Triples so far:", len(g))

Ontology initialized. Triples so far: 16


## 3) Load CSV and mint entities  
Create IRIs (Internationalized Resource Identifiers) from labels (simple normalization) and assert triples for each row.

In [7]:
# --- 1) Read CSV with correct dtypes ---
# Text columns are pinned to pandas' "string" dtype; everything else is left
# to pandas' own inference.
STRING_COLS = ["material", "crystal_structure", "typical_synthesis"]
df = pd.read_csv(DATA, dtype=dict.fromkeys(STRING_COLS, "string"))

# Force the measurement columns to numeric; unparseable cells become NaN
# instead of raising.
for numeric_col in ("band_gap_eV", "lattice_const_A"):
    df[numeric_col] = pd.to_numeric(df.get(numeric_col), errors="coerce")
# --- 2) Helpers ---
def _slugify(text: str) -> str:
    """Turn a free-text label into a local name made only of [A-Za-z0-9_]."""
    # Parentheses are dropped outright; every other character outside the
    # allowed set (spaces and slashes included) is replaced by an underscore.
    without_parens = text.strip().translate(str.maketrans("", "", "()"))
    return re.sub(r"[^A-Za-z0-9_]", "_", without_parens)

def mint_entity(label, cls: URIRef, fallback_prefix: str, idx: int):
    """
    Mint the IRI for one entity and record its rdf:type and rdfs:label in ``g``.

    label: cell value; may be pandas NA/None/float NaN or a proper string.
    cls: class IRI asserted as the entity's rdf:type.
    fallback_prefix, idx: used when no usable label exists, giving a stable
        fallback IRI like ex:Material_42 with the label "Material #42".

    Returns the entity IRI. Calling this again with the same label yields the
    same IRI, so repeated values share one node.

    A label is unusable when it is missing OR when it slugifies to an empty
    string (e.g. whitespace only). Without the second check ``EX[""]`` would be
    the bare namespace IRI and unrelated rows would collapse onto it.
    """
    is_missing = label is None or pd.isna(label)
    slug = "" if is_missing else _slugify(str(label))

    if slug:
        local_name, display = slug, str(label)
    else:
        local_name = f"{fallback_prefix}_{idx}"
        display = f"{fallback_prefix} #{idx}"

    iri = EX[local_name]
    g.add((iri, RDF.type, cls))
    g.add((iri, RDFS.label, Literal(display)))
    return iri

# --- 3) Ingest rows ---
# Column -> predicate mapping for the numeric (data) properties.
NUMERIC_PROPERTIES = (
    ("band_gap_eV", hasBandGap),
    ("lattice_const_A", hasLatticeConstant),
)

for row_idx, record in df.iterrows():
    # One entity per text column; the row index only matters for fallback IRIs.
    material_iri = mint_entity(record.get("material"), Material, "Material", row_idx)
    structure_iri = mint_entity(
        record.get("crystal_structure"), CrystalStructure, "CrystalStructure", row_idx
    )
    method_iri = mint_entity(
        record.get("typical_synthesis"), SynthesisMethod, "SynthesisMethod", row_idx
    )

    # Data properties (numbers): skipped when the cell is NaN/missing.
    for column, predicate in NUMERIC_PROPERTIES:
        value = record.get(column)
        if pd.notna(value):
            g.add((material_iri, predicate, Literal(float(value), datatype=XSD.float)))

    # Object properties (links) are always asserted.
    g.add((material_iri, hasCrystalStructure, structure_iri))
    g.add((material_iri, synthesizedBy, method_iri))

print("After ingest: triples =", len(g))

After ingest: triples = 76


## 4) Serialize to Turtle
write the RDF graph to a `.ttl` file so it’s versionable in Git and easy to inspect.


In [9]:
# Passing an encoding makes serialize() hand back bytes, ready to be written as-is.
ttl_bytes = g.serialize(format="turtle", encoding="utf-8")
with TTL_OUT.open("wb") as ttl_file:
    ttl_file.write(ttl_bytes)
print("Wrote:", TTL_OUT.resolve())

Wrote: E:\Projects\Semantic_models_for-MSE\data\semiconductors_small.ttl



## 5) SPARQL queries (quick checks)
query the in‑memory graph via `rdflib` to verify the ontology + data mapping.


In [None]:
# Q1) Materials with band gap > 1 eV (descending)
# Only ex: is declared in the query text; rdfs: is supplied through initNs.
q1 = """PREFIX ex: <http://example.org/mse#>
SELECT ?material ?Eg
WHERE {
  ?m a ex:Material ;
     rdfs:label ?material ;
     ex:hasBandGap ?Eg .
  FILTER(?Eg > 1.0)
}
ORDER BY DESC(?Eg)
"""
q1_results = g.query(q1, initNs={"rdfs": RDFS})
for hit in q1_results:
    print(hit)

(rdflib.term.Literal('Gallium Nitride'), rdflib.term.Literal('3.4', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#float')))
(rdflib.term.Literal('Silicon Carbide (4H)'), rdflib.term.Literal('3.26', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#float')))
(rdflib.term.Literal('Gallium Arsenide'), rdflib.term.Literal('1.42', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#float')))
(rdflib.term.Literal('Indium Phosphide'), rdflib.term.Literal('1.34', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#float')))
(rdflib.term.Literal('Silicon'), rdflib.term.Literal('1.12', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#float')))


In [None]:
# Q2) Materials synthesized by MOCVD
# The method is matched by its exact rdfs:label, not by its IRI.
q2 = """PREFIX ex: <http://example.org/mse#>
SELECT ?material
WHERE {
  ?m a ex:Material ;
     rdfs:label ?material ;
     ex:synthesizedBy ?meth .
  ?meth rdfs:label "MOCVD" .
}
"""
q2_results = g.query(q2, initNs={"rdfs": RDFS})
for hit in q2_results:
    print(hit)

(rdflib.term.Literal('Indium Phosphide'),)
(rdflib.term.Literal('Gallium Nitride'),)


In [12]:

# Q3) Materials with diamond cubic structure
# The structure is matched by its exact rdfs:label, not by its IRI.
q3 = """PREFIX ex: <http://example.org/mse#>
SELECT ?material
WHERE {
  ?m a ex:Material ;
     rdfs:label ?material ;
     ex:hasCrystalStructure ?cs .
  ?cs rdfs:label "Diamond cubic" .
}
"""
q3_results = g.query(q3, initNs={"rdfs": RDFS})
for hit in q3_results:
    print(hit)

(rdflib.term.Literal('Silicon'),)
(rdflib.term.Literal('Germanium'),)


## 6) Safeguards against possible problems from data scraping
Keep a few small rules here to catch obvious issues (labels missing, negative band gaps, etc.).

In [13]:
# Collect human-readable findings; an empty list means every rule passed.
problems = []

# A) All Materials should have labels
for s in g.subjects(RDF.type, EX.Material):
    if (s, RDFS.label, None) not in g:
        problems.append(f"Material without label: {s}")

# B) Band gap must be a finite, non-negative number
for s, p, o in g.triples((None, EX.hasBandGap, None)):
    try:
        value = float(o)
    except (TypeError, ValueError):
        problems.append(f"Non-numeric band gap for {s}: {o}")
        continue
    # NaN compares False against everything, so a plain `value < 0` test would
    # let it through; reject NaN/inf explicitly before the sign check.
    if not math.isfinite(value):
        problems.append(f"Non-finite band gap for {s}: {o}")
    elif value < 0:
        problems.append(f"Negative band gap for {s}")

print("No obvious problems ✅" if not problems else "Consistency problems:")
for x in problems:
    print("-", x)


No obvious problems ✅



## 7) Placeholder for LLM‑assisted extraction
When I replace this stub with a real LLM/NLP call, I’ll feed abstracts/tables and get back candidate triples to add to the graph.


In [14]:
def propose_triples_from_text(text: str):
    """Stub extractor: ignores ``text`` and returns one hard-coded candidate triple."""
    # demo placeholder: pretend I parsed that GaN has Eg ~3.4 eV
    # NOTE(review): ex:GaN is not one of the IRIs minted from the CSV labels, so
    # this fact appears to land on a separate node; confirm that is intended.
    gan_band_gap = Literal(3.4, datatype=XSD.float)
    return [(EX.GaN, EX.hasBandGap, gan_band_gap)]


# Add every proposed triple to the graph as-is.
for candidate in propose_triples_from_text("GaN has band gap ~3.4 eV"):
    g.add(candidate)

print("Triples after stub insert:", len(g))

Triples after stub insert: 77



## 8) Save again after updates
I keep the TTL in sync with the in‑memory graph.


In [16]:
# Re-serialize the whole graph and overwrite the earlier Turtle file.
updated_ttl = g.serialize(format="turtle", encoding="utf-8")
TTL_OUT.write_bytes(updated_ttl)
print("Updated:", TTL_OUT.resolve())

Updated: E:\Projects\Semantic_models_for-MSE\data\semiconductors_small.ttl


In [19]:
from pyvis.network import Network

def display_label(term):
    """Text shown for a node/edge: its rdfs:label, else a prefixed name, else str(term)."""
    label = g.value(term, RDFS.label)
    if label:
        return str(label).strip()

    plain = str(term).strip()
    # Only URIRefs can be shortened to a prefixed name.
    if not isinstance(term, URIRef):
        return plain
    try:
        return g.namespace_manager.normalizeUri(term)
    except Exception:
        # Best effort: fall back to the raw string if shortening fails.
        return plain

def node_id(term):
    """Return a pyvis node ID for an RDF term that is the same on every run.

    URIRefs and BNodes use their own string form. Any other term (in practice
    a Literal) gets ``lit:<digest>``, where the digest is taken over
    ``repr(term)`` so that datatype and language are part of the identity.

    The built-in ``hash()`` is avoided on purpose: string hashes are salted
    per interpreter process, so IDs built from it change between runs.
    """
    if isinstance(term, (URIRef, BNode)):
        return str(term)
    digest = hashlib.sha1(repr(term).encode("utf-8")).hexdigest()[:16]
    return f"lit:{digest}"

def node_style(term):
    """Pick pyvis colour/shape keyword arguments for an RDF term.

    Resources are coloured by their rdf:type in the module-level graph ``g``
    (Material / CrystalStructure / SynthesisMethod, checked in that order).
    Literals are drawn as grey boxes; anything else is a dark-grey ellipse.

    Returns a dict with the keys ``color`` and ``shape``.
    """
    # Only resources can carry rdf:type; literals get an empty type set.
    if isinstance(term, (URIRef, BNode)):
        type_iris = {str(t) for t in g.objects(term, RDF.type)}
    else:
        type_iris = set()

    # Matched on the local name so both '#' and '/' namespaces work.
    class_colors = (
        ("Material", "#2b8a3e"),
        ("CrystalStructure", "#1c7ed6"),
        ("SynthesisMethod", "#e8590c"),
    )
    for class_name, color in class_colors:
        suffixes = (f"#{class_name}", f"/{class_name}")
        if any(iri.endswith(suffixes) for iri in type_iris):
            return dict(color=color, shape="ellipse")

    if isinstance(term, Literal):
        return dict(color="#bfbfbf", shape="box")
    return dict(color="#666666", shape="ellipse")

def visualize_graph_pyvis(g, max_edges=1500, show_literals=False, height="700px",
                          filename="kg.html"):
    """Render the triples of ``g`` as an interactive pyvis network and save it as HTML.

    g: graph whose triples are drawn. Labels and colours come from
        display_label / node_style, which read the module-level graph, so pass
        that same graph here.
    max_edges: stop after this many edges have been drawn.
    show_literals: when False, triples whose object is a Literal are left out
        (neither the literal node nor the edge to it is drawn).
    height: CSS height of the canvas.
    filename: HTML file written by pyvis.
    """
    net = Network(height=height, width="100%", directed=True, notebook=True,
                  cdn_resources="in_line")  # avoid the Jupyter warning
    net.toggle_physics(True)

    added = set()
    edge_count = 0

    def ensure_node(term):
        """Add the node for ``term`` once and return its pyvis ID."""
        nid = node_id(term)
        if nid not in added:
            net.add_node(nid, label=display_label(term), **node_style(term))
            added.add(nid)
        return nid

    for s, p, o in g.triples((None, None, None)):
        if edge_count >= max_edges:
            break

        sid = ensure_node(s)

        # pyvis refuses edges to nodes that were never added, so a hidden
        # literal must skip the edge as well as the node.
        if isinstance(o, Literal) and not show_literals:
            continue

        oid = ensure_node(o)
        net.add_edge(sid, oid, label=display_label(p))
        edge_count += 1

    net.show(filename)
    print(f"Wrote: {filename} (open this file in your browser)")

# Draw the graph with literal values shown as nodes, capped at 1000 edges.
visualize_graph_pyvis(g, max_edges=1000, show_literals=True)

kg.html
Wrote: kg.html (open this file in your browser)
