# Neste notebook reproduzimos os resultados de um artigo (Ward, 2017)
## A execução pode demorar até duas horas, pois há muitos dados.

In [1]:
from platform import python_version

import numpy as np
from matminer.datasets import load_dataset
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers.composition import ElementProperty, Stoichiometry, ValenceOrbital, IonProperty
from matminer.featurizers.conversions import DictToObject
from matminer.featurizers.structure import (SiteStatsFingerprint, StructuralHeterogeneity,
                                            ChemicalOrdering, StructureComposition, MaximumPackingEfficiency)
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import ShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm

# Record the interpreter version this notebook was executed with
print(python_version())

3.8.8


In [2]:
# Descriptors are computed with matminer featurizers. The featurizers are
# listed in a fixed order (structure-based first, composition-based last);
# keep that order, since it presumably determines the feature column order.
structure_featurizers = [
    SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
    StructuralHeterogeneity(),
    ChemicalOrdering(),
    MaximumPackingEfficiency(),
    SiteStatsFingerprint.from_preset("LocalPropertyDifference_ward-prb-2017"),
]

# Composition featurizers, each wrapped in StructureComposition below
composition_featurizers = [
    Stoichiometry(),
    ElementProperty.from_preset("magpie"),
    ValenceOrbital(props=['frac']),
    IonProperty(fast=True),
]

featurizer = MultipleFeaturizer(
    structure_featurizers
    + [StructureComposition(comp_feat) for comp_feat in composition_featurizers]
)

In [3]:
# Load the "flla" dataset through matminer's dataset loader (fetched online)
data = load_dataset("flla")
n_entries = len(data)
print(f'Loaded {n_entries} entries')

Fetching flla.json.gz from https://ndownloader.figshare.com/files/13220597 to C:\Users\Ricardo\anaconda3\lib\site-packages\matminer\datasets\flla.json.gz


Fetching https://ndownloader.figshare.com/files/13220597 in MB: 2.607104MB [00:00, 81.54MB/s]                          


Loaded 3938 entries


In [4]:
# Convert the entries of the 'structure' column into objects; the result is
# written back to the same column (target_col_id matches the source column,
# and overwrite_data=True allows replacing it).
dto = DictToObject(target_col_id='structure', overwrite_data=True)
data = dto.featurize_dataframe(data, 'structure')

# Sanity check on a single entry: descriptor count and number of sites
first_structure = data['structure'][0]
n_descriptors = len(featurizer.featurize(first_structure))
print('Total de descritores:', n_descriptors)
print('Number of sites in structure:', len(first_structure))

DictToObject:   0%|          | 0/3938 [00:00<?, ?it/s]

Total de descritores: 273
Number of sites in structure: 2


In [None]:
# Compute the descriptors for every structure in the dataset (the slow step).
# ignore_errors=True keeps the run going when a structure fails to featurize;
# presumably those rows are left with missing values, which the imputer in the
# model pipeline is there to fill -- TODO confirm against the matminer docs.
X = featurizer.featurize_many(data['structure'], ignore_errors=True)

MultipleFeaturizer:   0%|          | 0/3938 [00:00<?, ?it/s]

In [None]:
# Turn the featurizer output into a NumPy array and report its shape
X = np.array(X)
input_shape = X.shape
print('Input:', input_shape)

# Treina ML

In [None]:
# Regression pipeline: impute missing feature values, then fit a random forest.
# random_state is fixed so that a fresh "Restart & Run All" rebuilds the same
# forest; without it every run yields slightly different predictions.
model = Pipeline([
    ('imputer', SimpleImputer()),  # fills the gaps left by structures that failed to featurize
    ('model', RandomForestRegressor(n_estimators=150, n_jobs=-1, random_state=0))
])

In [None]:
# Fit the pipeline on the whole dataset, using the formation energy per atom
# as the regression target. No data is held out in this cell.
model.fit(X, data['formation_energy_per_atom'])

In [None]:
# Estimate the prediction error with repeated random train/test splits.
target = data['formation_energy_per_atom']

# Seeded so the reported MAE can be reproduced from a fresh kernel.
splitter = ShuffleSplit(train_size=3000, n_splits=20, random_state=0)

maes = []
# split() is a generator, so tqdm is given the number of folds explicitly;
# otherwise the progress bar cannot show how far along the loop is.
for train_ids, test_ids in tqdm(splitter.split(X), total=splitter.get_n_splits()):
    # Split into training and test sets
    train_X = X[train_ids, :]
    train_y = target.iloc[train_ids]
    test_X = X[test_ids, :]
    test_y = target.iloc[test_ids]

    # Train an unfitted copy for this fold, so that `model` itself (fitted on
    # the full dataset) is not silently replaced by a 3000-sample fit.
    fold_model = clone(model)
    fold_model.fit(train_X, train_y)

    # Mean absolute error (MAE) on the held-out entries
    predict_y = fold_model.predict(test_X)
    maes.append(np.abs(test_y - predict_y).mean())

In [None]:
# Report the mean MAE over all splits together with its standard error
mae_mean = np.mean(maes)
mae_sem = stats.sem(maes)
print(f'MAE: {mae_mean:.3f}+/-{mae_sem:.3f} eV/atom')