# Training the elastic properties

This notebook goes through the multi-target usage of MODNet. Either (1) an n-dimensional, tree-like m-MODNet model can be created, or (2) a simple model with a single n-dimensional vector output.

In [None]:
# notebook dependencies
from modnet.models import MODNetModel
from modnet.preprocessing import MODData
from modnet.hyper_opt import FitGenetic
from modnet.models import MODNetModel
from sklearn.model_selection import train_test_split
import numpy as np
import time

## 1. Loading the dataset and creating the MODData instance

The elastic properties dataset (`elastic_tensor_2015`) from the matminer datasets is used in this example.

In [None]:
from matminer.datasets import load_dataset

# Load the matminer "elastic_tensor_2015" dataset as a DataFrame.
df = load_dataset("elastic_tensor_2015")

# Only the composition of each structure is used to describe a material.
compositions = [s.composition for s in df["structure"]]

# One array per target property, each read from the column of the same name.
G_VRH = df["G_VRH"].values
# Fix: K_VRH used to be read from the "G_VRH" column, which silently
# duplicated the first target instead of loading the second one.
K_VRH = df["K_VRH"].values
poisson = df["poisson_ratio"].values

In [None]:
# Build the multi-target MODData: materials are the compositions, and the
# targets form a 2D array with one row per material and one column per property.
data = MODData(
    materials=compositions,
    targets=np.array([G_VRH, K_VRH, poisson]).T,  # one property per column
    target_names=["G_VRH", "K_VRH", "p"],
)
data.featurize()

# 80/20 train/test split on the row indices. The seed is fixed so that the
# saved splits (and every MAE reported below) are reproducible across runs.
SPLIT_SEED = 42
idx_split = train_test_split(
    range(len(compositions)), test_size=0.2, random_state=SPLIT_SEED
)
train_data, test_data = data.split(idx_split)

In [None]:
# Run feature selection on the training split only (4 parallel jobs,
# using the precomputed cross-NMI).
train_data.feature_selection(use_precomputed_cross_nmi=True, n_jobs=4)

# Persist both splits to disk: train first, then test.
for split, path in [
    (train_data, "data/multi_prop_traindata"),
    (test_data, "data/multi_prop_testdata"),
]:
    split.save(path)

In [None]:
# Summary statistics of the training targets, shown as the cell output.
train_data.df_targets.describe()

## 2. Tree MODNetModel

In [None]:
# loading train and test data
# (the MODData objects written to disk by the earlier save() calls)
train_data = MODData.load("data/multi_prop_traindata")
test_data = MODData.load("data/multi_prop_testdata")

### Model
Each property is put in a different inner list: 

targets = [[["G_VRH"],["K_VRH"],["p"]]] 

This guarantees that the architecture will split on those properties, with multiple scalar output layers!


In [None]:
# Tree-like model: each property sits in its own inner list and gets its own weight.
tree_targets = [[["G_VRH"], ["K_VRH"], ["p"]]]
tree_weights = {"G_VRH": 1, "K_VRH": 1, "p": 1}
model = MODNetModel(tree_targets, weights=tree_weights)

# Inspect the architecture; look for the multiple output layers in the summary.
model.model.summary()

In [None]:
# fitting
# Only the training split is passed; test_data is kept aside for evaluation.
model.fit(train_data)

In [None]:
# Predict on both splits with the fitted model.
train_preds = model.predict(train_data)
test_preds = model.predict(test_data)

# Mean absolute error against the stored targets, computed in two steps:
# absolute errors first, then their mean.
train_abs_err = (train_preds - train_data.df_targets).abs()
test_abs_err = (test_preds - test_data.df_targets).abs()
train_mae = train_abs_err.mean()
test_mae = test_abs_err.mean()

report = "-> train mae\n{}\n-> test mae\n{}"
print(report.format(train_mae, test_mae))

## 3. Vector MODNet

In [None]:
# loading train and test data
# (reloaded from disk so this section starts from the saved MODData objects)

train_data = MODData.load("data/multi_prop_traindata")
test_data = MODData.load("data/multi_prop_testdata")

### Model
All properties are put in the same inner list: 

targets = [[["G_VRH", "K_VRH", "p"]]] 

This guarantees that the architecture will have a single output vector!

In [None]:
# Vector model: all three properties share a single inner list.
vector_targets = [[["G_VRH", "K_VRH", "p"]]]
# NOTE(review): only "G_VRH" carries a weight here; presumably the vector
# output is keyed by the first target of its group. Confirm against MODNetModel.
vector_weights = {"G_VRH": 1}
model = MODNetModel(vector_targets, weights=vector_weights)

# Inspect the architecture; it is expected to be fully sequential.
model.model.summary()

In [None]:
# fitting
# Only the training split is passed; test_data is kept aside for evaluation.
model.fit(train_data)

In [None]:
# Predict on both splits with the fitted vector-output model.
train_preds = model.predict(train_data)
test_preds = model.predict(test_data)

# Mean absolute error against the stored targets, computed in two steps:
# absolute errors first, then their mean.
train_abs_err = (train_preds - train_data.df_targets).abs()
test_abs_err = (test_preds - test_data.df_targets).abs()
train_mae = train_abs_err.mean()
test_mae = test_abs_err.mean()

report = "-> train mae\n{}\n-> test mae\n{}"
print(report.format(train_mae, test_mae))

## 4. Hyperparameter optimization
More realistically, you will use the FitGenetic class to optimize hyperparameters.
This class takes a `targets` argument that lets you decide whether a single vector output model or a multiple scalar output model is desired.

*Note 1*

It is also possible to have multiple vector output layers, e.g. targets = [[["p0","p1","p2"],["p3","p4"]]]

Or any combination: [[["p0","p1","p2"],["p3","p4"]],[["p5"]]]

*Note 2*
When dealing with many properties, gathering them in inner lists (i.e. vector architecture) is recommended, as it will result in faster training times!
Example:

In [None]:
# Reload the saved train and test MODData objects for the hyperparameter search.
train_data = MODData.load("data/multi_prop_traindata")
test_data = MODData.load("data/multi_prop_testdata")

In [None]:
# GA vector output: all targets share one inner list, i.e. a single vector
# output architecture.
ga = FitGenetic(train_data, targets=[[["G_VRH", "K_VRH", "p"]]])

# Small GA; use larger values for better optimization.
ga_settings = dict(nested=0, size_pop=10, num_generations=3, n_jobs=8, refit=1)

# Wall-clock timing of the search.
start_t = time.time()
model = ga.run(**ga_settings)
stop_t = time.time()

# Evaluate the model returned by the GA on both splits.
train_preds = model.predict(train_data)
test_preds = model.predict(test_data)
train_abs_err = (train_preds - train_data.df_targets).abs()
test_abs_err = (test_preds - test_data.df_targets).abs()
train_mae = train_abs_err.mean()
test_mae = test_abs_err.mean()

report = "-> train mae\n{}\n-> test mae\n{}"
print(report.format(train_mae, test_mae))
print("Hyperopt duration: {}".format(stop_t - start_t))

In [None]:
# GA multi scalar output: each target sits in its own inner list, i.e. a
# multiple scalar output architecture.
ga = FitGenetic(train_data, targets=[[["G_VRH"], ["K_VRH"], ["p"]]])

# Same small GA settings as the vector-output run.
ga_settings = dict(nested=0, size_pop=10, num_generations=3, n_jobs=8, refit=1)

# Wall-clock timing of the search.
start_t = time.time()
model = ga.run(**ga_settings)
stop_t = time.time()

# Evaluate the model returned by the GA on both splits.
train_preds = model.predict(train_data)
test_preds = model.predict(test_data)
train_abs_err = (train_preds - train_data.df_targets).abs()
test_abs_err = (test_preds - test_data.df_targets).abs()
train_mae = train_abs_err.mean()
test_mae = test_abs_err.mean()

report = "-> train mae\n{}\n-> test mae\n{}"
print(report.format(train_mae, test_mae))
print("Hyperopt duration: {}".format(stop_t - start_t))