In [7]:
import os
from getpass import getpass

api_key = os.getenv("MP_API_KEY")

if "MP_API_KEY" not in os.environ:
    os.environ["MP_API_KEY"] = getpass("Enter your MP_API_KEY (hidden input): ")


# Data parsing
**Miniature semiconductor systems**  
**Steps (via MP-api)**  
*Setting API → select FIELDS (incl. structure, use nelements field) → define chemistry menus (III–V, II–VI, group-IV binaries; III–V–N, III–V–VI ternaries)*  
*→ build pairs/ternaries (dedup, guarantee ternaries are triples) → fetch helper (materials.summary.search(elements=..., num_elements=..., chunk_size, num_chunks=1); suppress progress)*  
*→ collect (loop binaries → then ternaries until TARGET_N; keep only 0<Eg<4 eV; track seen by material_id) → serialize (JSON.gz full: structure.as_dict(), Element→symbol; CSV.gz comp-only)*

In [19]:
# setting env var outside Python to silence tqdm globally (preferred)
# Windows (CMD):   set DISABLE_TQDM=1
# PowerShell:      $env:DISABLE_TQDM="1"
# macOS/Linux:     export DISABLE_TQDM=1

import os, sys, gzip, json, itertools, contextlib
import pandas as pd
from mp_api.client import MPRester

# ----------------------------
# config
# ----------------------------

INCLUDE_STRUCTURE = True
TARGET_N = 10_000
EG_MIN, EG_MAX = 0.0, 4.0     # setting semiconductor-ish window

# setting fields (include 'structure' for structure-based pipeline)
# note: 'nelements' is a FIELD on docs; 'num_elements' is the FILTER param
FIELDS = [
    "material_id", "formula_pretty", "elements", "nelements",
    "formation_energy_per_atom", "energy_above_hull", "is_stable",
    "band_gap", "is_metal", "density"
]
if INCLUDE_STRUCTURE:
    FIELDS.append("structure")

# setting element families commonly seen in semiconductors
group_IV = ["C","Si","Ge","Sn"]
III      = ["B","Al","Ga","In"]
V        = ["N","P","As","Sb","Bi"]
II       = ["Zn","Cd","Hg","Mg","Be","Ca","Sr","Ba"]
VI       = ["O","S","Se","Te"]
halides  = ["F","Cl","Br","I"]  # available if you want I–VII

# ----------------------------
# utilities
# ----------------------------

# setting a context manager to suppress tqdm/progress output from mp_api calls
@contextlib.contextmanager
def suppress_progress():
    with open(os.devnull, "w") as devnull:
        old_out, old_err = sys.stdout, sys.stderr
        try:
            sys.stdout, sys.stderr = devnull, devnull
            yield
        finally:
            sys.stdout, sys.stderr = old_out, old_err

# setting builders for pairs and ternaries
def build_pairs():
    pairs = []
    pairs += list(itertools.product(III, V))            # III–V
    pairs += list(itertools.product(II, VI))            # II–VI
    pairs += list(itertools.combinations(group_IV, 2))  # group-IV binaries
    
    pairs += list(itertools.product(group_IV, VI))     # IV–VI (e.g., SnSe, GeS)
    pairs += list(itertools.product(["Cu","Ag"], VI))  # I–VI (for chalco precursors)
    seen, ordered = set(), []
    for p in pairs:
        key = tuple(sorted(p))
        if key not in seen:
            seen.add(key)
            ordered.append(tuple(p))
    return ordered

def build_ternaries():
    s = set()
    # III–V–N
    for a, b in itertools.product(III, V):
        s.add(tuple(sorted((a, b, "N"))))
    # III–V–VI
    for a, b, c in itertools.product(III, V, VI):
        s.add(tuple(sorted((a, b, c))))
    
    for a, b, c in itertools.product(["Cu","Ag"], III, VI):  # I–III–VI2 chalcopyrites
        s.add(tuple(sorted((a, b, c))))
    for a, b, c in itertools.product(II, group_IV, V):       # II–IV–V2 (e.g., ZnSiAs2)
        s.add(tuple(sorted((a, b, c))))
    out = [t for t in s if isinstance(t, (tuple, list)) and len(t) == 3]
    return sorted(out)

pairs = build_pairs()
ternaries = build_ternaries()

# setting fetch helper using 'num_elements' filter; suppressing progress per call
def fetch_elements_combo(required_elements, num_elements=2, chunk_size=400, max_chunks=30):
    out = []
    with MPRester(os.environ["MP_API_KEY"]) as mpr:
        for _ in range(max_chunks):
            with suppress_progress():
                docs = mpr.materials.summary.search(
                    elements=list(required_elements),
                    num_elements=num_elements,
                    fields=FIELDS,
                    num_chunks=1,
                    chunk_size=chunk_size
                )
            if not docs:
                break
            out.extend(docs)
            if len(docs) < chunk_size:
                break
    return out

# ----------------------------
# collection + band gap filter
# ----------------------------

seen = set()
collected = []

def add_docs(docs):
    added = 0
    for d in docs:
        mid = getattr(d, "material_id", None)
        if not mid or mid in seen:
            continue
        Eg = getattr(d, "band_gap", None)
        if Eg is None or not (EG_MIN < Eg < EG_MAX):
            continue
        collected.append(d)
        seen.add(mid)
        added += 1
    return added

# ----------------------------
# run: binaries first, then ternaries
# ----------------------------

for a, b in pairs:
    add_docs(fetch_elements_combo([a, b], num_elements=2, chunk_size=400, max_chunks=30))
    if len(collected) >= TARGET_N:
        break

if len(collected) < TARGET_N:
    for a, b, c in ternaries:
        add_docs(fetch_elements_combo([a, b, c], num_elements=3, chunk_size=400, max_chunks=25))
        if len(collected) >= TARGET_N:
            break

print(f"Collected {len(collected)} unique semiconductor-like entries (binaries/ternaries).")

# ----------------------------
# serialization helpers (JSON-safe)
# ----------------------------

def to_symbol_list(elems):
    # converting Element objects -> symbols; leaving strings untouched
    try:
        if elems and not isinstance(elems[0], str):
            return [getattr(e, "symbol", str(e)) for e in elems]
    except Exception:
        pass
    return elems

def structure_to_dict(s):
    # converting Structure to plain dict; handling dict/None gracefully
    if s is None:
        return None
    if isinstance(s, dict):
        return s
    # try as_dict(), else try to_json() -> dict
    try:
        return s.as_dict()
    except Exception:
        try:
            return json.loads(s.to_json())
        except Exception:
            return str(s)  # last resort: stringify

class MPJSONEncoder(json.JSONEncoder):
    # handling pymatgen Element, numpy types, and MSONable objects
    def default(self, o):
        try:
            from pymatgen.core.periodic_table import Element
            if isinstance(o, Element):
                return o.symbol
        except Exception:
            pass
        try:
            import numpy as np
            if isinstance(o, (np.integer,)):
                return int(o)
            if isinstance(o, (np.floating,)):
                return float(o)
            if isinstance(o, (np.ndarray,)):
                return o.tolist()
        except Exception:
            pass
        if hasattr(o, "as_dict"):
            return o.as_dict()
        return super().default(o)

# ----------------------------
# material dicts
# ----------------------------

def doc_to_plain(d):
    base = {
        "material_id": d.material_id,
        "formula_pretty": d.formula_pretty,
        "elements": to_symbol_list(d.elements),
        "nelements": d.nelements,
        "formation_energy_per_atom": d.formation_energy_per_atom,
        "energy_above_hull": d.energy_above_hull,
        "is_stable": d.is_stable,
        "band_gap": d.band_gap,
        "is_metal": d.is_metal,
        "density": d.density,
    }
    if INCLUDE_STRUCTURE and getattr(d, "structure", None) is not None:
        base["structure"] = structure_to_dict(d.structure)
    return base

plain = [doc_to_plain(d) for d in collected]

# ----------------------------
# write: JSON (full) + CSV (composition-only)
# ----------------------------

os.makedirs("data", exist_ok=True)
suffix = "with_struct" if INCLUDE_STRUCTURE else "comp_only"

# writing full json.gz (safe encoder)
json_path = f"data/semiconductors_full_{suffix}.json.gz"
with gzip.open(json_path, "wt") as f:
    json.dump(plain, f, cls=MPJSONEncoder)
print(f"✅ saved JSON → {json_path} (records: {len(plain)})")

# writing CSV.gz (composition-only; omitting 'structure')
csv_cols = ["material_id","formula_pretty","elements","nelements",
            "formation_energy_per_atom","energy_above_hull","is_stable",
            "band_gap","is_metal","density"]
df = pd.DataFrame([{k: rec.get(k) for k in csv_cols} for rec in plain])
csv_path = f"data/semiconductors_comp_only.csv.gz"
df.to_csv(csv_path, index=False, compression="gzip")
print(f"✅ saved CSV  → {csv_path} (rows: {len(df)})")

# quick peek
print(df.shape)
print(df.head(3))

Collected 1285 unique semiconductor-like entries (binaries/ternaries).
✅ saved JSON → data/semiconductors_full_with_struct.json.gz (records: 1285)
✅ saved CSV  → data/semiconductors_comp_only.csv.gz (rows: 1285)
(1285, 10)
  material_id formula_pretty elements  nelements  formation_energy_per_atom  \
0  mp-1244872             BN   [B, N]          2                  -0.771438   
1  mp-1244917             BN   [B, N]          2                  -0.496787   
2  mp-1244943             BN   [B, N]          2                  -0.756250   

   energy_above_hull  is_stable  band_gap  is_metal   density  
0           0.641541      False    1.5766     False  1.574835  
1           0.916192      False    1.1992     False  1.588014  
2           0.656729      False    1.1885     False  1.708090  
