In [7]:
import os
from getpass import getpass

api_key = os.getenv("MP_API_KEY")

if "MP_API_KEY" not in os.environ:
    os.environ["MP_API_KEY"] = getpass("Enter your MP_API_KEY (hidden input): ")


# Data parsing
**Miniature semiconductor systems**  
**Steps (via MP-api)**  
*Setting API → select FIELDS (incl. structure, use nelements field) → define chemistry menus (III–V, II–VI, group-IV binaries; III–V–N, III–V–VI ternaries)*  
*→ build pairs/ternaries (dedup, guarantee ternaries are triples) → fetch helper (materials.summary.search(elements=..., num_elements=..., chunk_size, num_chunks=1); suppress progress)*  
*→ collect (loop binaries → then ternaries until TARGET_N; keep only 0<Eg<4 eV; track seen by material_id) → serialize (JSON.gz full: structure.as_dict(), Element→symbol; CSV.gz comp-only)*

In [20]:
# setting env var outside Python to silence tqdm globally (preferred)
# Windows (CMD):   set DISABLE_TQDM=1
# PowerShell:      $env:DISABLE_TQDM="1"
# macOS/Linux:     export DISABLE_TQDM=1

import os, sys, gzip, json, itertools, contextlib
import pandas as pd
from mp_api.client import MPRester

# ----------------------------
# config
# ----------------------------

INCLUDE_STRUCTURE = True
TARGET_N = 10_000
EG_MIN, EG_MAX = 0.0, 4.0     # setting semiconductor-ish window

# setting fields (include 'structure' for structure-based pipeline)
# note: 'nelements' is a FIELD on docs; 'num_elements' is the FILTER param
FIELDS = [
    "material_id", "formula_pretty", "elements", "nelements",
    "formation_energy_per_atom", "energy_above_hull", "is_stable",
    "band_gap", "is_metal", "density"
]
if INCLUDE_STRUCTURE:
    FIELDS.append("structure")

# setting element families commonly seen in semiconductors
group_IV = ["C","Si","Ge","Sn"]
III      = ["B","Al","Ga","In"]
V        = ["N","P","As","Sb","Bi"]
II       = ["Zn","Cd","Hg","Mg","Be","Ca","Sr","Ba"]
VI       = ["O","S","Se","Te"]
halides  = ["F","Cl","Br","I"]  # available if you want I–VII

# ----------------------------
# utilities
# ----------------------------

# setting a context manager to suppress tqdm/progress output from mp_api calls
@contextlib.contextmanager
def suppress_progress():
    with open(os.devnull, "w") as devnull:
        old_out, old_err = sys.stdout, sys.stderr
        try:
            sys.stdout, sys.stderr = devnull, devnull
            yield
        finally:
            sys.stdout, sys.stderr = old_out, old_err

# setting builders for pairs and ternaries
def build_pairs():
    pairs = []
    pairs += list(itertools.product(III, V))            # III–V
    pairs += list(itertools.product(II, VI))            # II–VI
    pairs += list(itertools.combinations(group_IV, 2))  # group-IV binaries
    
    pairs += list(itertools.product(group_IV, VI))     # IV–VI (e.g., SnSe, GeS)
    pairs += list(itertools.product(["Cu","Ag"], VI))  # I–VI (for chalco precursors)
    seen, ordered = set(), []
    for p in pairs:
        key = tuple(sorted(p))
        if key not in seen:
            seen.add(key)
            ordered.append(tuple(p))
    return ordered

def build_ternaries():
    s = set()
    # III–V–N
    for a, b in itertools.product(III, V):
        s.add(tuple(sorted((a, b, "N"))))
    # III–V–VI
    for a, b, c in itertools.product(III, V, VI):
        s.add(tuple(sorted((a, b, c))))
    
    for a, b, c in itertools.product(["Cu","Ag"], III, VI):  # I–III–VI2 chalcopyrites
        s.add(tuple(sorted((a, b, c))))
    for a, b, c in itertools.product(II, group_IV, V):       # II–IV–V2 (e.g., ZnSiAs2)
        s.add(tuple(sorted((a, b, c))))
    out = [t for t in s if isinstance(t, (tuple, list)) and len(t) == 3]
    return sorted(out)

pairs = build_pairs()
ternaries = build_ternaries()

# setting fetch helper using 'num_elements' filter; suppressing progress per call
def fetch_elements_combo(required_elements, num_elements=2, chunk_size=400, max_chunks=100):
    out = []
    with MPRester(os.environ["MP_API_KEY"]) as mpr:
        for _ in range(max_chunks):
            with suppress_progress():
                docs = mpr.materials.summary.search(
                    elements=list(required_elements),
                    num_elements=num_elements,
                    fields=FIELDS,
                    num_chunks=1,
                    chunk_size=chunk_size
                )
            if not docs:
                break
            out.extend(docs)
            if len(docs) < chunk_size:
                break
    return out

# ----------------------------
# collection + band gap filter
# ----------------------------

seen = set()
collected = []

def add_docs(docs):
    added = 0
    for d in docs:
        mid = getattr(d, "material_id", None)
        if not mid or mid in seen:
            continue
        Eg = getattr(d, "band_gap", None)
        if Eg is None or not (EG_MIN < Eg < EG_MAX):
            continue
        collected.append(d)
        seen.add(mid)
        added += 1
    return added

# ----------------------------
# run: binaries first, then ternaries
# ----------------------------

for a, b in pairs:
    add_docs(fetch_elements_combo([a, b], num_elements=2, chunk_size=400, max_chunks=30))
    if len(collected) >= TARGET_N:
        break

if len(collected) < TARGET_N:
    for a, b, c in ternaries:
        add_docs(fetch_elements_combo([a, b, c], num_elements=3, chunk_size=400, max_chunks=25))
        if len(collected) >= TARGET_N:
            break

print(f"Collected {len(collected)} unique semiconductor-like entries (binaries/ternaries).")

# ----------------------------
# serialization helpers (JSON-safe)
# ----------------------------

def to_symbol_list(elems):
    # converting Element objects -> symbols; leaving strings untouched
    try:
        if elems and not isinstance(elems[0], str):
            return [getattr(e, "symbol", str(e)) for e in elems]
    except Exception:
        pass
    return elems

def structure_to_dict(s):
    # converting Structure to plain dict; handling dict/None gracefully
    if s is None:
        return None
    if isinstance(s, dict):
        return s
    # try as_dict(), else try to_json() -> dict
    try:
        return s.as_dict()
    except Exception:
        try:
            return json.loads(s.to_json())
        except Exception:
            return str(s)  # last resort: stringify

class MPJSONEncoder(json.JSONEncoder):
    # handling pymatgen Element, numpy types, and MSONable objects
    def default(self, o):
        try:
            from pymatgen.core.periodic_table import Element
            if isinstance(o, Element):
                return o.symbol
        except Exception:
            pass
        try:
            import numpy as np
            if isinstance(o, (np.integer,)):
                return int(o)
            if isinstance(o, (np.floating,)):
                return float(o)
            if isinstance(o, (np.ndarray,)):
                return o.tolist()
        except Exception:
            pass
        if hasattr(o, "as_dict"):
            return o.as_dict()
        return super().default(o)

# ----------------------------
# material dicts
# ----------------------------

def doc_to_plain(d):
    base = {
        "material_id": d.material_id,
        "formula_pretty": d.formula_pretty,
        "elements": to_symbol_list(d.elements),
        "nelements": d.nelements,
        "formation_energy_per_atom": d.formation_energy_per_atom,
        "energy_above_hull": d.energy_above_hull,
        "is_stable": d.is_stable,
        "band_gap": d.band_gap,
        "is_metal": d.is_metal,
        "density": d.density,
    }
    if INCLUDE_STRUCTURE and getattr(d, "structure", None) is not None:
        base["structure"] = structure_to_dict(d.structure)
    return base

plain = [doc_to_plain(d) for d in collected]

# ----------------------------
# write: JSON (full) + CSV (composition-only)
# ----------------------------

os.makedirs("data", exist_ok=True)
suffix = "with_struct" if INCLUDE_STRUCTURE else "comp_only"

# writing full json.gz (safe encoder)
json_path = f"data/semiconductors_full_{suffix}.json.gz"
with gzip.open(json_path, "wt") as f:
    json.dump(plain, f, cls=MPJSONEncoder)
print(f"✅ saved JSON → {json_path} (records: {len(plain)})")

# writing CSV.gz (composition-only; omitting 'structure')
csv_cols = ["material_id","formula_pretty","elements","nelements",
            "formation_energy_per_atom","energy_above_hull","is_stable",
            "band_gap","is_metal","density"]
df = pd.DataFrame([{k: rec.get(k) for k in csv_cols} for rec in plain])
csv_path = f"data/semiconductors_comp_only.csv.gz"
df.to_csv(csv_path, index=False, compression="gzip")
print(f"✅ saved CSV  → {csv_path} (rows: {len(df)})")

# quick peek
print(df.shape)
print(df.head(3))

Collected 1285 unique semiconductor-like entries (binaries/ternaries).
✅ saved JSON → data/semiconductors_full_with_struct.json.gz (records: 1285)
✅ saved CSV  → data/semiconductors_comp_only.csv.gz (rows: 1285)
(1285, 10)
  material_id formula_pretty elements  nelements  formation_energy_per_atom  \
0  mp-1244872             BN   [B, N]          2                  -0.771438   
1  mp-1244917             BN   [B, N]          2                  -0.496787   
2  mp-1244943             BN   [B, N]          2                  -0.756250   

   energy_above_hull  is_stable  band_gap  is_metal   density  
0           0.641541      False    1.5766     False  1.574835  
1           0.916192      False    1.1992     False  1.588014  
2           0.656729      False    1.1885     False  1.708090  


# Dataset preparation

**a. Composition pipeline**    

load raw comp dataset → standardize columns/types → filter valid rows → attach Composition objects → featurize elemental stats → assemble X,y → persist features → quick QA

*a.1: (load + standardize + initial filtering)*  

In [23]:
# setting paths
RAW_CSV = "data/semiconductors_comp_only.csv.gz"
CLEAN_CSV = "data/comp_raw_clean.csv.gz"

# importing libs
import ast
import pandas as pd

# reading comp-only dataset  
df = pd.read_csv(RAW_CSV)

# standardizing columns expected downstream
required_cols = ["material_id","formula_pretty","elements","nelements","band_gap"]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"missing columns in raw file: {missing}")

# converting 'elements' column from string repr -> python list[str]
def _parse_elems(x):
    # handling NaN or already-a-list
    if isinstance(x, list):
        return x
    if pd.isna(x):
        return None
    try:
        obj = ast.literal_eval(x)
        if isinstance(obj, list):
            # enforcing strings (symbols) only
            return [str(e) for e in obj]
    except Exception:
        pass
    # fallback: split on non-letters if a weird string sneaks in
    return [tok for tok in str(x).replace("[","").replace("]","").replace("'","").split(",") if tok.strip()]

df["elements"] = df["elements"].apply(_parse_elems)

# dropping bad rows (no elements list, no formula, invalid Eg)
df = df.dropna(subset=["formula_pretty","elements","band_gap"])
df = df[df["band_gap"].between(0, 4, inclusive="neither")]

# dropping duplicate materials (keeping the first)
if "material_id" in df.columns:
    df = df.drop_duplicates(subset=["material_id"])

# saving cleaned raw for the next substep
df.to_csv(CLEAN_CSV, index=False, compression="gzip")

# counting binaries/ternaries
n_bin  = int((df["nelements"] == 2).sum())
n_ter  = int((df["nelements"] == 3).sum())


# quick peek
print(f"✅ cleaned comp dataset → {CLEAN_CSV} | rows={len(df)}")
print(df[["material_id","formula_pretty","elements","nelements","band_gap"]].head(5))
print(f"binaries: {n_bin} | ternaries: {n_ter} ")

✅ cleaned comp dataset → data/comp_raw_clean.csv.gz | rows=1285
  material_id formula_pretty elements  nelements  band_gap
0  mp-1244872             BN   [B, N]          2    1.5766
1  mp-1244917             BN   [B, N]          2    1.1992
2  mp-1244943             BN   [B, N]          2    1.1885
3  mp-1244991             BN   [B, N]          2    1.5503
4  mp-1245193             BN   [B, N]          2    0.2781
binaries: 619 | ternaries: 666 


*a.2: composition prep*  

load raw comp dataset → attach Composition objects

In [25]:
# setting paths
CLEAN_CSV = "data/comp_raw_clean.csv.gz"
COMP_CACHE_CSV = "data/comp_cache.csv.gz"

# importing libs
import json
import pandas as pd
from pymatgen.core.composition import Composition

# loading cleaned dataset
df = pd.read_csv(CLEAN_CSV)

# setting safe Composition parser
def to_comp_safe(formula: str):
    try:
        return Composition(str(formula))
    except Exception:
        return None

df["comp_obj"] = df["formula_pretty"].apply(to_comp_safe)
df["comp_ok"] = df["comp_obj"].notna()

# dropping rows that failed to parse
n_fail = int((~df["comp_ok"]).sum())
if n_fail:
    print(f"dropping {n_fail} rows that failed Composition() parsing")
df = df[df["comp_ok"]].copy()

# setting element→amount maps (keys are symbols, values are floats)
def el_amt_map(c: Composition):
    d = c.get_el_amt_dict()  # keys already symbols in current pymatgen
    return {str(k): float(v) for k, v in d.items()}

df["comp_el_amt"]   = df["comp_obj"].apply(el_amt_map)
df["comp_mson"]     = df["comp_obj"].apply(lambda c: c.as_dict())

# serializing dicts for CSV
df["comp_el_amt_json"] = df["comp_el_amt"].apply(json.dumps)
df["comp_mson_json"]   = df["comp_mson"].apply(json.dumps)

# saving compact cache for next substep
cache_cols = [
    "material_id","formula_pretty","elements","nelements","band_gap",
    "comp_el_amt_json","comp_mson_json"
]
df[cache_cols].to_csv(COMP_CACHE_CSV, index=False, compression="gzip")

# quick QA
print(f"✅ comp cache saved → {COMP_CACHE_CSV} | rows={len(df)}")
print(df[["material_id","formula_pretty","nelements","band_gap","comp_el_amt"]].head(3))

✅ comp cache saved → data/comp_cache.csv.gz | rows=1285
  material_id formula_pretty  nelements  band_gap           comp_el_amt
0  mp-1244872             BN          2    1.5766  {'B': 1.0, 'N': 1.0}
1  mp-1244917             BN          2    1.1992  {'B': 1.0, 'N': 1.0}
2  mp-1244943             BN          2    1.1885  {'B': 1.0, 'N': 1.0}


*a.3: : featurize elemental stats*

In [28]:
# setting paths
COMP_CACHE_CSV = "data/comp_cache.csv.gz"

from pymatgen.core import Element
import numpy as np

# loading composition cache
df = pd.read_csv(COMP_CACHE_CSV)

# parsing element→amount dicts (unnormalized), then making fractions
def parse_el_amt(s):
    try:
        d = json.loads(s)
        # forcing keys as symbols, values as float
        d = {str(k): float(v) for k, v in d.items()}
        total = sum(d.values())
        if total <= 0:
            return None, None
        fracs = {k: v / total for k, v in d.items()}
        return d, fracs
    except Exception:
        return None, None

el_amt_fracs = df["comp_el_amt_json"].apply(parse_el_amt)
df["el_amt"]   = [t[0] for t in el_amt_fracs]
df["el_fracs"] = [t[1] for t in el_amt_fracs]
df = df.dropna(subset=["el_fracs"]).reset_index(drop=True)

# setting elemental properties to aggregate
# using: atomic number (Z), Pauling electronegativity (X), atomic mass, Mendeleev number, atomic radius
PROP_FUNCS = {
    "Z":               lambda e: e.Z,
    "X":               lambda e: e.X,                 # may be None for some elements
    "mass":            lambda e: float(e.atomic_mass) if e.atomic_mass is not None else np.nan,
    "mendeleev_no":    lambda e: e.mendeleev_no,
    "atomic_radius":   lambda e: e.atomic_radius,     # may be None
}

# setting helper to get numeric array for a property given fraction dict
def prop_values_and_weights(fracs: dict, prop_key: str):
    vals, wts = [], []
    fn = PROP_FUNCS[prop_key]
    for sym, w in fracs.items():
        try:
            e = Element(sym)
            v = fn(e)
        except Exception:
            v = np.nan
        vals.append(np.nan if v is None else float(v))
        wts.append(float(w))
    return np.array(vals, dtype=float), np.array(wts, dtype=float)

# setting weighted stats (fractions already sum to 1)
def weighted_stats(vals: np.ndarray, wts: np.ndarray):
    # drop NaNs in vals (adjust weights)
    mask = ~np.isnan(vals)
    if not np.any(mask):
        return dict(mean=np.nan, std=np.nan, vmin=np.nan, vmax=np.nan, vrange=np.nan)
    v = vals[mask]
    w = wts[mask]
    w = w / w.sum()
    m = np.sum(w * v)
    var = np.sum(w * (v - m) ** 2)
    s = math.sqrt(var)
    vmin, vmax = float(np.min(v)), float(np.max(v))
    return dict(mean=float(m), std=float(s), vmin=vmin, vmax=vmax, vrange=float(vmax - vmin))

# setting composition diversity metrics
def comp_entropy(fracs: dict):
    p = np.array(list(fracs.values()), dtype=float)
    return float(-np.sum(p * np.log(p)))  # natural log

def comp_max_fraction(fracs: dict):
    return float(max(fracs.values()))

# computing features
feat_rows = []
for i, row in df.iterrows():
    fr = row["el_fracs"]
    feats = {
        "material_id": row["material_id"],
        "nelements": row["nelements"],
        "H_composition": comp_entropy(fr),
        "max_elem_frac": comp_max_fraction(fr),
    }
    for pk in PROP_FUNCS.keys():
        vals, wts = prop_values_and_weights(fr, pk)
        st = weighted_stats(vals, wts)
        feats[f"{pk}_mean"]   = st["mean"]
        feats[f"{pk}_std"]    = st["std"]
        feats[f"{pk}_min"]    = st["vmin"]
        feats[f"{pk}_max"]    = st["vmax"]
        feats[f"{pk}_range"]  = st["vrange"]
    feat_rows.append(feats)

comp_feats = pd.DataFrame(feat_rows)

# joining back light metadata for convenience
comp_feats = comp_feats.merge(df[["material_id","formula_pretty","band_gap"]], on="material_id", how="left")

# quick QA
print(f"✅ composition features computed | rows={len(comp_feats)} | cols={comp_feats.shape[1]}")
print(comp_feats.head(3)[[
    "material_id","nelements","H_composition","max_elem_frac",
    "Z_mean","Z_std","X_mean","X_std","mass_mean","mass_std","band_gap"
]])

✅ composition features computed | rows=1285 | cols=31
  material_id  nelements  H_composition  max_elem_frac  Z_mean  Z_std  X_mean  \
0  mp-1244872          2       0.693147            0.5     6.0    1.0    2.54   
1  mp-1244917          2       0.693147            0.5     6.0    1.0    2.54   
2  mp-1244943          2       0.693147            0.5     6.0    1.0    2.54   

   X_std  mass_mean  mass_std  band_gap  
0    0.5   12.40885   1.59785    1.5766  
1    0.5   12.40885   1.59785    1.1992  
2    0.5   12.40885   1.59785    1.1885  


*a.4: assemble X, y*

In [29]:
# setting paths
FEATS_ALL_PATH   = "data/comp_X_all.parquet"
TARGET_ALL_PATH  = "data/comp_y_all.parquet"
XTR_PATH, XVA_PATH = "data/comp_X_train.parquet", "data/comp_X_valid.parquet"
YTR_PATH, YVA_PATH = "data/comp_y_train.parquet", "data/comp_y_valid.parquet"

# importing libs
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# expecting 'comp_feats' from previous substep; failing fast if missing
if "comp_feats" not in globals():
    raise RuntimeError("comp_feats not found. Run the previous substep to build composition features.")

# selecting feature columns (numerics only, drop metadata/target)
meta_cols = {"material_id","formula_pretty","band_gap"}
num_df = comp_feats.drop(columns=[c for c in meta_cols if c in comp_feats.columns])
num_df = num_df.select_dtypes(include=[np.number])

# dropping columns with too many NaNs (setting threshold = 30%)
na_ratio = num_df.isna().mean()
drop_cols = list(na_ratio[na_ratio > 0.30].index)
if drop_cols:
    print(f"dropping {len(drop_cols)} cols with >30% NaNs")
    num_df = num_df.drop(columns=drop_cols)

# replacing infs -> NaN then imputing with column medians
num_df = num_df.replace([np.inf, -np.inf], np.nan)
medians = num_df.median(axis=0, numeric_only=True)
num_df = num_df.fillna(medians)

# enforcing float32 for compactness/speed
X = num_df.astype(np.float32)
y = comp_feats["band_gap"].astype(np.float32)

# saving full matrices (for later reuse)
X.to_parquet(FEATS_ALL_PATH, index=False)
y.to_frame("band_gap").to_parquet(TARGET_ALL_PATH, index=False)

# creating a simple train/valid split (setting random_state for reproducibility)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=42)

# saving split
X_tr.to_parquet(XTR_PATH, index=False)
X_va.to_parquet(XVA_PATH, index=False)
y_tr.to_frame("band_gap").to_parquet(YTR_PATH, index=False)
y_va.to_frame("band_gap").to_parquet(YVA_PATH, index=False)

# quick QA
print(f"✅ X_all: {X.shape} | y_all: {y.shape}")
print(f"   train: {X_tr.shape} / valid: {X_va.shape} | features: {X.shape[1]}")
print(f"   dropped_cols({len(drop_cols)}): {drop_cols[:8]}{' ...' if len(drop_cols)>8 else ''}")
print(f"   y stats → min={float(y.min()):.3f}, median={float(y.median()):.3f}, max={float(y.max()):.3f}")


ModuleNotFoundError: No module named 'sklearn'