In [None]:
%pip install mp-api pymatgen matminer pandas scikit-learn matplotlib

%pip freeze > requirements.txt
%pip install -r requirements.txt


In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from mp_api.client import MPRester
from pymatgen.core import Element
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split


In [None]:
# Read the Materials Project API key from the environment instead of embedding
# it in the notebook. A key that was ever committed in source should be treated
# as compromised and regenerated.
API_KEY = os.environ.get("MP_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Materials Project API key not found: set the MP_API_KEY environment variable."
    )
mpr = MPRester(API_KEY)

# Query summary documents for non-deprecated materials whose band gap has a
# lower bound of 0.1 and no upper bound.
# NOTE(review): num_chunks/chunk_size presumably cap the download at
# 10 chunks of 100 documents each - confirm against the mp-api docs.
data = mpr.materials.summary.search(
    band_gap=(0.1, None),
    deprecated=False,
    num_chunks=10,
    chunk_size=100
)

# One row per returned document, one column per field of the dumped model.
df_real = pd.DataFrame([d.model_dump() for d in data])



In [None]:

# Swap the raw API field names for the display labels that the feature
# selection and modelling cells refer to.
df_real.rename(
    columns=dict(
        band_gap="Bandgap",
        formation_energy_per_atom="Formation Energy",
        energy_above_hull="Stability",
        formula_pretty="Formula",
        density="Density",
        volume="Volume",
    ),
    inplace=True,
)


def mean_electronegativity(elements):
    """Return the arithmetic mean of `Element(e).X` over `elements`.

    Args:
        elements: Collection of element identifiers accepted by
            `pymatgen.core.Element` (for example symbols such as "Fe").

    Returns:
        The mean electronegativity, or None when `elements` is empty/None or
        when any entry cannot be resolved or averaged.
    """
    try:
        # An empty or missing composition has no meaningful average.
        if not elements:
            return None
        return sum(Element(e).X for e in elements) / len(elements)
    except Exception:
        # Best-effort feature: bad rows become None and are dropped later.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long .apply() can still be interrupted.
        return None

# Derive the per-material electronegativity feature from each element list.
electronegativity = df_real["elements"].apply(mean_electronegativity)
df_real["Mean Electronegativity"] = electronegativity

# Discard rows that lack any value the model takes as an input feature.
df_real.dropna(
    subset=[
        "Formation Energy",
        "Stability",
        "Density",
        "Volume",
        "Mean Electronegativity",
    ],
    inplace=True,
)

print(f"Total materials after cleaning: {len(df_real)}")
df_real.head()



In [None]:

# Input descriptors for the regressors; the target is the band gap column.
feature_columns = [
    "Formation Energy",
    "Stability",
    "Density",
    "Volume",
    "Mean Electronegativity",
]
X = df_real[feature_columns]
y = df_real["Bandgap"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)


In [None]:
from xgboost import XGBRegressor

# Train XGBoost model
xgb_model = XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=42, verbosity=0)
xgb_model.fit(X_train, y_train)
print("XGBoost model training completed!")

# Predict on the held-out split. The results use XGBoost-specific names because
# the Gradient Boosting cells assign `y_pred` and `mae`; sharing those names
# would silently discard the XGBoost results once those cells run.
y_pred_xgb = xgb_model.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)

print("Predicted Bandgap values:", y_pred_xgb)
print("Actual Bandgap values:", y_test.values)
print("Mean Absolute Error (MAE):", mae_xgb)


In [None]:
# Fit scikit-learn's gradient boosting regressor on the training split.
# fit() returns the estimator itself, so construction and training chain.
model = GradientBoostingRegressor(
    random_state=42,
    learning_rate=0.05,
    n_estimators=200,
).fit(X_train, y_train)
print("Model training completed!")


In [None]:
# Score the gradient boosting model on the held-out rows.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

# Show the raw predictions next to the ground truth, then the summary error.
print("Predicted Bandgap values:", y_pred)
print("Actual Bandgap values:", y_test.values)
print("Mean Absolute Error (MAE):", mae)


In [None]:
# Plot predicted vs actual bandgap values for the XGBoost model.
# The predictions are taken from `xgb_model` right here: by this point `y_pred`
# has been reassigned to the Gradient Boosting predictions, so plotting it
# under an "XGBoost" title would show the wrong model.
y_pred_xgb = xgb_model.predict(X_test)

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_xgb, alpha=0.7, color="royalblue", edgecolors='k')
# Dashed y = x reference line spanning the full range of the target.
plt.plot([y.min(), y.max()], [y.min(), y.max()], '--r', linewidth=2)
plt.xlabel("Actual Bandgap")
plt.ylabel("Predicted Bandgap")
plt.title("XGBoost: Actual vs Predicted Bandgap")
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Scatter of gradient boosting predictions against the true values, with a
# dashed y = x line marking where a perfect prediction would fall.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color='blue', label='Predicted vs. Actual')

lo, hi = min(y_test), max(y_test)
ax.plot([lo, hi], [lo, hi], color='red', linestyle='--', label='Ideal Prediction')

ax.set_xlabel("Actual Bandgap (eV)")
ax.set_ylabel("Predicted Bandgap (eV)")
ax.set_title("Predicted vs. Actual Bandgap (Gradient Boosting)")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Horizontal bar chart of the fitted gradient boosting model's importance
# score for each input column.
feature_names = X.columns
importances = model.feature_importances_

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(feature_names, importances, color='skyblue')
ax.set(
    xlabel="Feature Importance",
    ylabel="Features",
    title="Feature Importance for Bandgap Prediction",
)
plt.show()
