# Required Libraries

In [1]:
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
from mp_api.client import MPRester

from pymatgen.core.composition import Composition
from matminer.featurizers.composition import ElementProperty
from matminer.featurizers.structure import DensityFeatures, GlobalSymmetryFeatures
from pymatgen.core import Structure

from pycaret.regression import setup, compare_models, tune_model, finalize_model, predict_model, save_model, load_model
import xgboost as xgb
import joblib

# Data Scrapping

In [2]:
# ðŸ”‘ 3. Fetch Data from Materials Project
API_KEY = '2yqf6FmGq648PC6a4JaNk1jXd5G5LyKo'  # Replace with your Materials Project API key
mpr = MPRester(API_KEY)

# Fetching materials data with specific properties
with MPRester(API_KEY) as mpr:
        entries = mpr.materials.summary.search(
        fields=['formula_pretty', 'formation_energy_per_atom', 'structure'],
        num_chunks=9,
        #num_chunks=None
        chunk_size=500
        )

print(f"Retrieved {len(entries)} Materials with their desired properties.")

Retrieving SummaryDoc documents:   0%|          | 0/4500 [00:00<?, ?it/s]

Retrieved 4500 Materials with their desired properties.


In [3]:
print(entries[0].model_dump())

Lattice
    abc : 5.160296 5.2402447696794665 5.89021299170463
 angles : 90.23517285321574 90.49680784381113 90.25713099844408
 volume : 159.2695518855733
      A : 5.160296 0.0 0.0
      B : -0.023517 5.240192 0.0
      C : -0.051073 -0.024406 5.889941
    pbc : True True True
PeriodicSite: O (4.4080, 2.5940, 5.8225) [0.8663, 0.4996, 0.9886]
PeriodicSite: O (0.4295, 2.2106, 5.7774) [0.0949, 0.4264, 0.9809]
PeriodicSite: O (1.8839, 5.1653, 3.0112) [0.3746, 0.9881, 0.5112]
PeriodicSite: O (3.0761, 0.3396, 3.0839) [0.6016, 0.0672, 0.5236]
PeriodicSite: O (2.5640, 4.5633, 0.0876) [0.5010, 0.8709, 0.0149]
PeriodicSite: O (2.2015, 0.5036, 0.1107) [0.4272, 0.0962, 0.0188]
PeriodicSite: O (4.7429, 2.1596, 2.8439) [0.9258, 0.4144, 0.4828]
PeriodicSite: O (5.0504, 3.3623, 2.8683) [0.9865, 0.6439, 0.4870]
PeriodicSite: O (3.1178, 1.9707, 2.9482) [0.6109, 0.3784, 0.5005]


In [4]:
# --- Step 3: Convert to DataFrame for analysis ---
# Each entry is a SummaryDoc Object; convert to rows
data = []
for i in entries:
    data.append({
        
        "formula": i.formula_pretty,
        "structure": i.structure,
        "formation_energy_per_atom": i.formation_energy_per_atom,
    })

df = pd.DataFrame(data)
df.to_csv("Formation Energy Data.csv", index=False)

In [5]:
df.head()

Unnamed: 0,formula,structure,formation_energy_per_atom
0,O2,"[[4.40802865 2.5939677 5.82250118] O, [0.4294...",0.387014
1,C,"[[7.5298275 6.30191464 0.68331063] C, [8.4740...",0.994253
2,Ti,"[[1.91951051 1.37136319 9.00059297] Ti, [ 8.19...",0.141384
3,Si,"[[0.18969876 3.07592306 5.37172487] Si, [-0.21...",0.349291
4,Nb,"[[-0.99328832 -0.42135479 3.13782546] Nb, [1....",0.189748


# Featurization

In [6]:
# ðŸ§© 4. Feature Engineering
df['composition'] = df['formula'].apply(Composition)

# Composition Features
ep_feat = ElementProperty.from_preset(preset_name="magpie")
df = ep_feat.featurize_dataframe(df, col_id='composition', ignore_errors=True)

# Structure Features
structure_features = []
for s in df['structure']:
    features = {}
    try:
        dens_feat = DensityFeatures().featurize(s)
        features.update(dict(zip(DensityFeatures().feature_labels(), dens_feat)))

        gsf_feat = GlobalSymmetryFeatures().featurize(s)
        features.update(dict(zip(GlobalSymmetryFeatures().feature_labels(), gsf_feat)))
    except Exception as e:
        features = {f: np.nan for f in features.keys()}
    structure_features.append(features)

structure_df = pd.DataFrame(structure_features)

ElementProperty:   0%|          | 0/4500 [00:00<?, ?it/s]

# Data curing

In [7]:
# Merge composition and structure features
X = pd.concat([df, structure_df], axis=1)

# Drop columns that aren't features
X = X.drop(columns=['composition', 'structure', 'formation_energy_per_atom'])

# Drop rows with any missing values
X = X.dropna()

# Now align y with the rows that remain in X
# Since we dropped rows in X, the index might be different.
y = df.loc[X.index, 'formation_energy_per_atom']

In [8]:
#print(X[:5])
print(y[:5])

0    0.387014
1    0.994253
2    0.141384
3    0.349291
4    0.189748
Name: formation_energy_per_atom, dtype: float64


In [9]:
model_data = pd.concat([X, y], axis=1)
model_data.rename(columns={'formation_energy_per_atom': 'target'}, inplace=True)
print(model_data.dtypes)

formula                       object
MagpieData minimum Number    float64
MagpieData maximum Number    float64
MagpieData range Number      float64
MagpieData mean Number       float64
                              ...   
crystal_system                object
crystal_system_int           float64
is_centrosymmetric            object
n_symmetry_ops               float64
target                       float64
Length: 142, dtype: object


In [10]:
non_numeric_cols = model_data.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_cols)

Non-numeric columns: Index(['formula', 'crystal_system', 'is_centrosymmetric'], dtype='object')


In [11]:
# Make a copy of your model_data
model_data_clean = model_data.copy()

# Drop 'formula' (we already extracted features from it)
model_data_clean = model_data_clean.drop(columns=['formula'])

# Handle 'crystal_system' (one-hot encoding)
model_data_clean = pd.get_dummies(model_data_clean, columns=['crystal_system'])

# Handle 'is_centrosymmetric' (map True/False to 1/0)
# In some cases it might be a string, so let's handle both cases robustly
model_data_clean['is_centrosymmetric'] = model_data_clean['is_centrosymmetric'].map({
    True: 1, False: 0, 
    'True': 1, 'False': 0
})

# Save the last 278 rows as unseen data
unseen_data = model_data_clean.tail(278)
model_data_clean = model_data_clean.iloc[:-278]  # saves the rest for model training and optimization

# Reset index for both datasets
model_data_clean.reset_index(drop=True, inplace=True)
unseen_data.reset_index(drop=True, inplace=True)

# Double-check dtypes
print(model_data_clean.dtypes)
print( "cleaned dataset size for model training and validation:", model_data_clean.shape)
print( "cleaned Unseen dataset size:", unseen_data.shape)


MagpieData minimum Number      float64
MagpieData maximum Number      float64
MagpieData range Number        float64
MagpieData mean Number         float64
MagpieData avg_dev Number      float64
                                ...   
crystal_system_monoclinic         bool
crystal_system_orthorhombic       bool
crystal_system_tetragonal         bool
crystal_system_triclinic          bool
crystal_system_trigonal           bool
Length: 147, dtype: object
cleaned dataset size for model training and validation: (4200, 147)
cleaned Unseen dataset size: (278, 147)


In [12]:
non_numeric_cols_clean = model_data_clean.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_cols_clean)

Non-numeric columns: Index([], dtype='object')


# Model Selection :
Pycaret is used.  
https://pycaret.gitbook.io/docs  
Models were choosen based on sorting metrics: 1. mean absolute error (mae), i.e., focusing on prediction accuracy  2. R2 score, i.e., focusing on capturing the variance in the training dataset.

1. Model Selection: Metric 'mae'

In [13]:
from pycaret.regression import *
# Setup
regression_setup_mae = setup(
    data=model_data_clean,
    target='target',
    session_id=123,
    fold=5,
    train_size=0.8,
    n_jobs=-1,
    verbose=False,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.9,
    #remove_outliers=True,
    #outliers_threshold=0.05
)

# Compare, tune, finalize models
best_model_mae = compare_models(sort='MAE', n_select=3)
tuned_model_mae = tune_model(best_model_mae[0], optimize='MAE', search_library='scikit-optimize')
final_model_mae= finalize_model(tuned_model_mae)

# Save : final_model used the metric R2.
save_model(final_model_mae, 'formation_energy_final_model_mae')

# Load the saved model
model_mae = load_model('formation_energy_final_model_mae')

# Retrieve holdout set
X_test_mae = get_config('X_test')
y_test_mae = get_config('y_test')
X_train_mae = get_config('X_train')
y_train_mae = get_config('y_train')

# Get predictions using the final model
predictions_df_mae = predict_model(model_mae, data=X_test_mae)

# The predictions are in the 'Label' column
y_pred_mae = predictions_df_mae['prediction_label']
y_pred_mae.head()
print("Prediction_mae list size:",  y_pred_mae.shape)
print("target_test_mae list size:",  y_test_mae.shape)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.1772,0.2167,0.4619,0.8223,0.1767,3110118755.3944,1.322
catboost,CatBoost Regressor,0.184,0.176,0.4161,0.8556,0.1693,13029745160.2215,4.686
xgboost,Extreme Gradient Boosting,0.1858,0.1946,0.4366,0.84,0.1744,3676658896.214,0.388
rf,Random Forest Regressor,0.1893,0.1891,0.4339,0.8456,0.1735,5121885299.6563,1.84
lightgbm,Light Gradient Boosting Machine,0.1938,0.1648,0.4054,0.8658,0.1695,12952922459.0655,0.326
gbr,Gradient Boosting Regressor,0.2381,0.2084,0.4557,0.8305,0.1926,11102470356.8445,0.694
dt,Decision Tree Regressor,0.248,0.3414,0.5828,0.7224,0.2252,1904295118.3349,0.174
knn,K Neighbors Regressor,0.3488,0.3701,0.6084,0.6995,0.2651,3464177670.7302,0.158
br,Bayesian Ridge,0.3753,0.3547,0.5946,0.7118,0.2537,35559270014.416,0.15
ridge,Ridge Regression,0.3763,0.3488,0.5898,0.7165,0.2552,29609777587.0569,0.144


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1896,0.2284,0.4779,0.8209,0.1784,3.157
1,0.206,0.2169,0.4657,0.8197,0.1866,14339980130.8158
2,0.2195,0.2691,0.5187,0.7661,0.217,10.4224
3,0.2095,0.2085,0.4566,0.833,0.1943,3.1188
4,0.2147,0.1783,0.4223,0.8622,0.1853,2.4704
Mean,0.2078,0.2202,0.4682,0.8204,0.1923,2867996029.9969
Std,0.0102,0.0295,0.0313,0.0312,0.0133,5735992050.4095


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Loaded


Prediction_mae list size: (840,)
target_test_mae list size: (840,)


2. Model selection: Metric R2

In [14]:
from pycaret.regression import get_config
# Setup
regression_setup = setup(
    data=model_data_clean,
    target='target',
    session_id=123,
    fold=5,
    train_size=0.8,
    n_jobs=-1,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.9,
    verbose=False,
    #remove_outliers=True,
    #outliers_threshold=0.05
    )

# Compare, Tune, Finalize 
best_model = compare_models(sort='R2', n_select=3)
tuned_model = tune_model(best_model[0], optimize='R2', search_library='scikit-optimize')
final_model = finalize_model(tuned_model)

# Save : final_model used the metric R2.
save_model(final_model, 'formation_energy_final_model')

# Load the saved model
model = load_model('formation_energy_final_model')

# Retrieve holdout set
X_test = get_config('X_test')
y_test = get_config('y_test')
X_train = get_config('X_train')
y_train = get_config('y_train')


# Get predictions using the final model
predictions_df = predict_model(model, data=X_test)

# The predictions are in the 'Label' column
y_pred = predictions_df['prediction_label']
y_pred.head()
print("Prediction list size:",  y_pred.shape)
print("Prediction list size:",  y_test.shape)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.1938,0.1648,0.4054,0.8658,0.1695,12952922459.0655,0.372
catboost,CatBoost Regressor,0.184,0.176,0.4161,0.8556,0.1693,13029745160.2215,4.528
rf,Random Forest Regressor,0.1893,0.1891,0.4339,0.8456,0.1735,5121885299.6563,1.934
xgboost,Extreme Gradient Boosting,0.1858,0.1946,0.4366,0.84,0.1744,3676658896.214,0.424
gbr,Gradient Boosting Regressor,0.2381,0.2084,0.4557,0.8305,0.1926,11102470356.8445,0.734
et,Extra Trees Regressor,0.1772,0.2167,0.4619,0.8223,0.1767,3110118755.3944,1.43
dt,Decision Tree Regressor,0.248,0.3414,0.5828,0.7224,0.2252,1904295118.3349,0.19
ridge,Ridge Regression,0.3763,0.3488,0.5898,0.7165,0.2552,29609777587.0569,0.164
lr,Linear Regression,0.3806,0.3498,0.5906,0.7156,0.2571,29128802326.5856,0.166
br,Bayesian Ridge,0.3753,0.3547,0.5946,0.7118,0.2537,35559270014.416,0.158


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2533,0.2857,0.5345,0.7759,0.214,10.4864
1,0.2647,0.1981,0.4451,0.8353,0.2115,76152318963.0581
2,0.2747,0.3077,0.5547,0.7326,0.2351,62.9024
3,0.2654,0.2367,0.4865,0.8104,0.2119,6.6398
4,0.2893,0.2576,0.5075,0.8009,0.2262,5.0873
Mean,0.2695,0.2571,0.5057,0.791,0.2197,15230463809.6348
Std,0.012,0.0381,0.0381,0.0349,0.0094,30460927576.7117


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Loaded


Prediction list size: (840,)
Prediction list size: (840,)


In [15]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

def safe_mape(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    mask = y_true != 0  # avoid division by zero
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

corrected_mape_mae = safe_mape(y_test_mae, y_pred_mae)
# Calculate other metrics
mae_mae = mean_absolute_error(y_test_mae, y_pred_mae)
rmse_mae = root_mean_squared_error(y_test_mae, y_pred_mae)
r2_mae = r2_score(y_test_mae, y_pred_mae)

print(f"Corrected MAPE_MAE: {corrected_mape_mae:.2f}%")
print(f"MAE_MAE: {mae_mae:.4f}")
print(f"RMSE_MAE: {rmse_mae:.4f}")
print(f"R2 Score_MAE: {r2_mae:.4f}")

Corrected MAPE_MAE: 0.00%
MAE_MAE: 0.0000
RMSE_MAE: 0.0000
R2 Score_MAE: 1.0000


2. evaluation: Metric- R2

In [16]:
corrected_mape = safe_mape(y_test, y_pred)
# Calculate other metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Corrected MAPE: {corrected_mape:.2f}%")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R2 Score: {r2:.4f}")


Corrected MAPE: 546.51%
MAE: 0.1097
RMSE: 0.1904
R2 Score: 0.9666


# Model Analysis
https://pycaret.gitbook.io/docs/get-started/quickstart#regression

1. Sort: MAE

In [17]:
from pycaret.regression import evaluate_model
from pycaret.regression import pull
# Setup
regression_setup_mae = setup(
    data=model_data_clean,
    target='target',
    session_id=123,
    fold=5,
    train_size=0.8,
    n_jobs=-1,
    verbose=False,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.9,
    #remove_outliers=True,
    #outliers_threshold=0.05
)
evaluate_model(model_mae)
print(best_model_mae)

# Use pull() after evaluate_model or plot_model to grab the results
summary_df_mae = pull()
print(summary_df_mae)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelinâ€¦

[ExtraTreesRegressor(n_jobs=-1, random_state=123), <catboost.core.CatBoostRegressor object at 0x000001C2B842CD60>, XGBRegressor(base_score=None, booster='gbtree', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device='cpu', early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=None,
             n_jobs=-1, num_parallel_tree=None, ...)]
                    Description             Value
0                    Session id               123
1                        Target

2. Sort: R2

In [18]:
# Setup
regression_setup = setup(
    data=model_data_clean,
    target='target',
    session_id=123,
    fold=5,
    train_size=0.8,
    n_jobs=-1,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.9,
    verbose=False,
    #remove_outliers=True,
    #outliers_threshold=0.05
    )

evaluate_model(model)
print(best_model)

# Use pull() after evaluate_model or plot_model to grab the results
summary_df = pull()
print(summary_df)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelinâ€¦

[LGBMRegressor(n_jobs=-1, random_state=123), <catboost.core.CatBoostRegressor object at 0x000001C2B9DA89D0>, RandomForestRegressor(n_jobs=-1, random_state=123)]
                    Description             Value
0                    Session id               123
1                        Target            target
2                   Target type        Regression
3           Original data shape       (4200, 147)
4        Transformed data shape        (4200, 94)
5   Transformed train set shape        (3360, 94)
6    Transformed test set shape         (840, 94)
7              Numeric features               139
8                    Preprocess              True
9               Imputation type            simple
10           Numeric imputation              mean
11       Categorical imputation              mode
12     Remove multicollinearity              True
13  Multicollinearity threshold               0.9
14               Fold Generator             KFold
15                  Fold Number        