## Summary of Notebook by Razieh Morad
> **Problem:** Predict PBE band gap (eV) from composition features.  
> **Data:** `matbench_mp_gap` (Materials Project; \(N \approx 10^5\) rows).  
> **Approach:** Composition featurization (matminer) → Ridge vs. XGBoost; splits = IID (stratified by binned gap) + OOD (hold out chosen elements).  
> **Metrics:** MAE / RMSE / \(R^2\) on **test**; parity & error-by-chemistry plots.  
> **Primary results:** 
> XGB (MAE **0.504**; R² **0.859**) beats Ridge (MAE **0.673**; R² **0.768**) on the IID split. Under a chemistry shift holding out {**Bi, Te**}, XGB’s MAE rises to **0.553** (≈ **+9.7%** vs. IID) and R² drops to **0.654**, indicating some OOD sensitivity.


# 1. Imports 

In [None]:
# Repro + paths
import os, sys, math, json, pathlib, warnings
from pathlib import Path
from getpass import getpass

import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict


from mp_api.client import MPRester
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
from pymatgen.io.cif import CifWriter


from matminer.featurizers.composition import ElementProperty, ValenceOrbital, AtomicOrbitals, IonProperty
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.base import MultipleFeaturizer

from matminer.datasets import load_dataset
from pymatgen.core import Structure
from pymatgen.core import Composition

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from xgboost import XGBRegressor

# Keep notebook output clean and make NumPy-based randomness repeatable.
warnings.filterwarnings("ignore")
np.random.seed(42)

# Project directories, anchored at the directory the notebook is run from.
ROOT = Path.cwd()
DATA = ROOT.joinpath("data")
FIGS = ROOT.joinpath("reports", "figures")
# Figures are written later on, so make sure their folder exists now.
FIGS.mkdir(parents=True, exist_ok=True)


# 2. Load datasets

In [None]:
# Never commit credentials to the notebook. The key is taken from the
# MP_API_KEY environment variable when set; otherwise it is requested
# interactively without echoing it to the screen.
# NOTE(review): the key that was previously hard-coded here has been exposed
# in version control and should be revoked/regenerated.
api_key = os.environ.get("MP_API_KEY") or getpass("Materials Project API key: ")

In [None]:
# Build a clean table of Materials Project (MP) summary features.
# Summary fields requested for every matching entry.
FIELDS = [
    "material_id", "formula_pretty", "symmetry", "nsites", "energy_per_atom", "volume",
    "energy_above_hull", "total_magnetization",
]


def _summary_row(entry):
    """Flatten one MP summary document into a plain dict (one table row)."""
    fields = {
        "material_id": entry.material_id,
        "formula": entry.formula_pretty,
        "num_sites": entry.nsites,
        "energy_per_atom": entry.energy_per_atom,
        "volume": entry.volume,
        "energy_above_hull": entry.energy_above_hull,
        "total_magnetization": entry.total_magnetization,
    }
    # Symmetry columns are only filled in when the document carries symmetry data.
    if entry.symmetry:
        fields["spacegroup_number"] = entry.symmetry.number
        fields["crystal_system"] = entry.symmetry.crystal_system
    return fields


with MPRester(api_key) as mpr:
    docs = mpr.materials.summary.search(
        num_elements=[2, 4],    # range inclusive [2,4]
        energy_above_hull=(0, 0.001),
        fields=FIELDS
    )

records = [_summary_row(entry) for entry in docs]
mp_df_raw = pd.DataFrame(records)

# Keep a single representative per formula: rank by lowest energy above hull,
# break ties by smallest volume, then keep the first row seen for each formula.
ranked = mp_df_raw.sort_values(by=["energy_above_hull", "volume"], ascending=[True, True])
mp_df = ranked.drop_duplicates(subset="formula", keep="first").reset_index(drop=True)
print("One representative per formula (2–4 elements)", len(mp_df))
mp_df.head()


In [None]:
#  Standardize & union experimental band-gap datasets 
def normalize_formula(s):
    """Return the reduced formula of ``s``, or ``None`` if it cannot be parsed.

    ``s`` is converted with ``str()`` and handed to pymatgen's ``Composition``;
    rows whose formula fails to parse yield ``None``.
    """
    try:
        return Composition(str(s)).reduced_formula
    except Exception:  # bad rows; do not swallow KeyboardInterrupt/SystemExit
        return None

# ---- experimental band gap dataset 1 ----
# Bring the column names in line with the second dataset, then normalize
# every formula to its reduced form so both sources can be matched on it.
exp1 = load_dataset("matbench_expt_gap")
exp1 = exp1.rename(columns={"composition": "formula", "gap expt": "gap_expt"})
exp1["formula"] = exp1["formula"].apply(normalize_formula)
exp1 = exp1[["formula", "gap_expt"]]
print(f"Experimental #1 band gap entries:  {len(exp1)}")

# ---- experimental band gap dataset 2 ----
exp2 = load_dataset("expt_gap")
exp2 = exp2.rename(columns={"gap expt": "gap_expt"})
exp2["formula"] = exp2["formula"].apply(normalize_formula)
exp2 = exp2[["formula", "gap_expt"]]
print(f"Experimental #2 band gap entries:  {len(exp2)}")

# union of experimental datasets:
#   - drop rows whose formula failed to normalize or whose gap is missing,
#   - keep only rows with a strictly positive gap,
#   - keep the first occurrence of each formula (dataset 1 wins over dataset 2).
expt = (pd.concat([exp1, exp2], ignore_index=True)
          .dropna(subset=["formula", "gap_expt"])
          .query("gap_expt > 0")
          .drop_duplicates("formula", keep="first")
          .reset_index(drop=True))
expt.head(), expt.shape

In [None]:
# Join MP features with experimental gaps; only formulas found in both survive.
df = pd.merge(mp_df, expt, how="inner", on="formula")
df = df.reset_index(drop=True)
print("Final merged dataset:", df.shape)


In [None]:
#  Quick Visualization: number of merged entries per crystal system.
# Name the column via keyword arguments: in current seaborn the first
# positional argument is `data`, not `x`, so passing the Series positionally
# does not produce per-category counts.
sns.countplot(data=df, x="crystal_system")

# 3. Featurize composition

In [None]:
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition import ElementProperty, ValenceOrbital, AtomicOrbitals, IonProperty
import numpy as np

# normalized formula already; create Composition object
tmp = pd.DataFrame({"formula": df["formula"]})
tmp = StrToComposition(target_col_id="composition_obj").featurize_dataframe(tmp, "formula")

featurizer = MultipleFeaturizer([
    ElementProperty.from_preset("magpie"),
    ValenceOrbital(),
    AtomicOrbitals(),
    IonProperty(),
])
# base numerical cols
base_cols = ["num_sites", "energy_per_atom", "volume", "energy_above_hull",
             "total_magnetization", "spacegroup_number"]

# one-hot the crystal system (categorical)
crystal = pd.get_dummies(df["crystal_system"].astype("category"), prefix="xtal", dummy_na=True)

# featurizing formulae
feat = featurizer.featurize_dataframe(tmp, "composition_obj", ignore_errors=True)
feat = feat.drop(columns=["formula", "composition_obj"])

# dealing with categorical features
homo = pd.get_dummies(feat["HOMO_character"].astype("category"), prefix="homo", dummy_na=True)
lumo = pd.get_dummies(feat["LUMO_character"].astype("category"), prefix="lumo", dummy_na=True)
feat = feat.drop(columns=["HOMO_character", "LUMO_character"])

# TODO: Add Atomic Weight and Covalent Radius (not implemented yet)

X = pd.concat([feat, homo, lumo, df[base_cols], crystal], axis=1)
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median(numeric_only=True))

# drop unwanted categorical columns
cat = X.select_dtypes(include=['object']).columns
X = X.drop(cat, axis=1)
# Median imputation cannot fill a column that is entirely NaN, so drop such
# columns. Rows are deliberately NOT dropped: X must stay aligned row-for-row
# with df and y. (The previous `X.dropna(axis=0)` discarded its result and
# therefore had no effect.)
X = X.dropna(axis=1, how="all")
# X = X.drop(columns=['num_sites','volume'])

y = df["gap_expt"].astype(float).values
X.shape, y.shape

# Save the processed data into data folder
# `pd.DataFrame(X, y)` would use y as the *index* and reindex X by the target
# values, scrambling the table; instead append the target as its own column.
out_path = Path("../data/raw/band_gap_preprocessed.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)  # to_csv does not create folders
X.assign(gap_expt=y).to_csv(out_path, index=False)


# 4. Proper splits + metrics

In [None]:
# IID 80/20 split. The continuous target is cut into (up to) ten quantile
# bins so that the split can be stratified on it.
bins = pd.qcut(y, q=10, duplicates="drop")
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=bins)
X_train, X_test, y_train, y_test = split_parts

# 5. Baselines: Ridge (with scaler) + XGBoost

In [None]:
# Ridge baseline: features are standardized inside the pipeline before the fit.
ridge_steps = [
    ("scaler", StandardScaler()),
    ("model", Ridge(alpha=1.0, random_state=42)),
]
ridge = Pipeline(ridge_steps)
ridge.fit(X_train, y_train)
pred_r = ridge.predict(X_test)

# Gradient-boosted trees baseline.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
)
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)
pred_x = xgb.predict(X_test)

def report(y_true, y_pred, name):
    """Print MAE, RMSE and R² of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted target values, of equal length.
    name : str
        Label printed in front of the metrics (padded to at least 6 chars).
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is taken as sqrt(MSE) rather than via the `squared=False` keyword,
    # which is not accepted by every scikit-learn release.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2 = r2_score(y_true, y_pred)
    print(f"{name:6s} — MAE {mae:.3f} | RMSE {rmse:.3f} | R² {r2:.3f}")

# Score both baselines on the held-out IID test set.
for model_name, preds in (("Ridge", pred_r), ("XGB", pred_x)):
    report(y_test, preds, model_name)



# 6. Parity + error slicing

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def parity(y_true, y_pred, title):
    """Draw, save and show a parity (true vs. predicted) scatter plot.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted band gaps; any sequence convertible to a float
        array is accepted.
    title : str
        Plot title; also used (lower-cased, spaces replaced by underscores)
        in the name of the PNG written under ``reports/figures``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    plt.figure(figsize=(5, 5))
    # Both axes share one range. The lower bound is 0 unless some value is
    # negative (a linear model can predict a negative gap), so that no point
    # is silently clipped out of the plot.
    lower = min(0.0, y_true.min(), y_pred.min())
    upper = max(y_true.max(), y_pred.max())
    lim = (lower, upper)
    plt.scatter(y_true, y_pred, s=6, alpha=0.5)
    plt.plot(lim, lim, "--")  # y = x reference line
    plt.xlabel("True band gap (eV)"); plt.ylabel("Predicted (eV)"); plt.title(title)
    plt.xlim(lim); plt.ylim(lim); plt.tight_layout()
    plt.savefig(f"reports/figures/parity_{title.replace(' ','_').lower()}.png", dpi=160)
    plt.show()

# Parity plots for both baselines.
for plot_title, preds in (("Ridge", pred_r), ("XGB", pred_x)):
    parity(y_test, preds, plot_title)

# error by crystal system
# X_test keeps the row labels of df, so they can be used to look up the
# crystal system of every test sample.
test_idx = X_test.index
xtal_of_test = df.loc[test_idx, "crystal_system"].astype("category")
xgb_abs_err = np.abs(y_test - pred_x)
err = pd.DataFrame({"crystal_system": xtal_of_test, "abs_err": xgb_abs_err})

# Mean absolute XGB error per crystal system.
plt.figure(figsize=(6, 3))
sns.barplot(data=err, x="crystal_system", y="abs_err", estimator=np.mean)
plt.xticks(rotation=30)
plt.tight_layout()
plt.savefig("reports/figures/err_by_xtal_xgb.png", dpi=160)
plt.show()


# 7. OOD hold-out by elements (chemistry shift)
> An out-of-distribution (OOD) test: train on compounds that do not contain certain elements, then test only on compounds that do.
It simulates cold-start chemistry (new element families) to measure how the model generalizes when the data distribution changes.

In [None]:
from pymatgen.core import Composition

elem_set = {"Te","Bi"}  # adjust
def has_any(formula, target):
    """Return True if ``formula`` contains at least one element in ``target``.

    Parameters
    ----------
    formula : str
        Chemical formula, parsed with pymatgen's ``Composition``.
    target : container of str
        Element symbols to look for.

    Formulas that cannot be parsed are reported as ``False``.
    """
    try:
        return any(el.symbol in target for el in Composition(formula).elements)
    except Exception:  # unparseable formula; do not swallow KeyboardInterrupt
        return False

# Rows whose formula contains any held-out element form the OOD test set;
# every other row is used for training.
mask_ood = df["formula"].apply(has_any, target=elem_set)
in_dist = ~mask_ood
X_tr, y_tr = X[in_dist], y[in_dist]
X_ood, y_ood = X[mask_ood], y[mask_ood]

# Same hyper-parameters as the IID XGB baseline, trained without the
# held-out chemistry.
ood_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
)
xgb_ood = XGBRegressor(**ood_params)
xgb_ood.fit(X_tr, y_tr)
pred_ood = xgb_ood.predict(X_ood)
report(y_ood, pred_ood, f"XGB OOD {sorted(elem_set)}")


# 8. Save artifacts

In [None]:
import joblib
import os

# Persist the IID-trained XGB model.
os.makedirs("models", exist_ok=True)
joblib.dump(xgb, "models/xgb_expt_gap.joblib")

# Write the merged table together with XGB predictions for every row.
# NOTE: `xgb` was fitted on the training split only, so this column mixes
# in-sample (train) and held-out (test) predictions.
# to_csv does not create missing folders, so create the target directory first.
os.makedirs("../data/processed", exist_ok=True)
df.assign(pred_xgb_full=xgb.predict(X)).to_csv("../data/processed/expt_gap_features_preds.csv", index=False)
